From 4a5f4d2f891bcff7285b5f7490ed5a7a5d516049 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Jun 2018 05:56:12 -0700 Subject: genirq: Use rcu in kstat_irqs_usr() Jeremy Dorfman identified mutex contention when multiple threads parse /proc/stat concurrently. Since commit 425a5072dcd1 ("genirq: Free irq_desc with rcu"), kstat_irqs_usr() can be switched to rcu locking, which removes this mutex contention. show_interrupts() case will be handled in a separate patch. Reported-by: Jeremy Dorfman Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Eric Dumazet Cc: Willem de Bruijn Link: https://lkml.kernel.org/r/20180618125612.155057-1-edumazet@google.com --- kernel/irq/irqdesc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index afc7f902d74a..578d0e5f1b5b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -443,6 +443,7 @@ static void free_desc(unsigned int irq) * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. + * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } @@ -928,17 +929,17 @@ unsigned int kstat_irqs(unsigned int irq) * kstat_irqs_usr - Get the statistics for an interrupt * @irq: The interrupt number * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. Contrary to kstat_irqs() this can be called from any - * preemptible context. It's protected against concurrent removal of - * an interrupt descriptor when sparse irqs are enabled. + * Returns the sum of interrupt counts on all cpus since boot for @irq. + * Contrary to kstat_irqs() this can be called from any context. + * It uses rcu since a concurrent removal of an interrupt descriptor is + * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; - irq_lock_sparse(); + rcu_read_lock(); sum = kstat_irqs(irq); - irq_unlock_sparse(); + rcu_read_unlock(); return sum; } -- cgit v1.2.3 From 0a13ec0bbc42bddf90ab6a444df8aaa67c148b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sun, 17 Jun 2018 14:40:18 +0200 Subject: genirq: Fix editing error in a comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the comment was reflowed to a wider format, the "*" snuck in. Fixes: ae88a23b32fa ("irq: refactor and clean up the free_irq() code flow") Signed-off-by: Jonathan Neuschäfer Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180617124018.25539-1-j.neuschaefer@gmx.net --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index daeabd791d58..591cfe901162 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1638,7 +1638,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a - * 'real' IRQ doesn't run in * parallel with our fake. ) + * 'real' IRQ doesn't run in parallel with our fake. 
) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); -- cgit v1.2.3 From f5a89295e2f566d8e9f1d4f3d524d8d3c966958c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:08:00 +0200 Subject: time: Use ktime_get_real_seconds() in time syscall Both get_seconds() and do_gettimeofday() are deprecated. Change the time() implementation to use the replacement function instead. Obviously the system call will still overflow in 2038, but this gets us closer to removing the old helper functions. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: y2038@lists.linaro.org Cc: Stephen Boyd Cc: Deepa Dinamani Cc: Al Viro Link: https://lkml.kernel.org/r/20180618140811.2998503-2-arnd@arndb.de --- kernel/time/time.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 6fa99213fc72..b1225db61eb2 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL(sys_tz); */ SYSCALL_DEFINE1(time, time_t __user *, tloc) { - time_t i = get_seconds(); + time_t i = (time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -106,11 +106,9 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) /* compat_time_t is a 32 bit "long" and needs to get converted. */ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) { - struct timeval tv; compat_time_t i; - do_gettimeofday(&tv); - i = tv.tv_sec; + i = (compat_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) -- cgit v1.2.3 From d30faff900e666f9a6395a159fdd353c02f5bed0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:08:01 +0200 Subject: timekeeping: Use ktime_get_real_ts64() instead of getnstimeofday64() The two do the same, this moves all users to the newer name for consistency. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: y2038@lists.linaro.org Cc: Stephen Boyd Cc: Miroslav Lichvar Link: https://lkml.kernel.org/r/20180618140811.2998503-3-arnd@arndb.de --- kernel/time/ntp.c | 6 +++--- kernel/time/timekeeping.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a09ded765f6c..10a79053e82f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -502,7 +502,7 @@ static void sched_sync_hw_clock(struct timespec64 now, { struct timespec64 next; - getnstimeofday64(&next); + ktime_get_real_ts64(&next); if (!fail) next.tv_sec = 659; else { @@ -537,7 +537,7 @@ static void sync_rtc_clock(void) if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) return; - getnstimeofday64(&now); + ktime_get_real_ts64(&now); adjust = now; if (persistent_clock_is_local) @@ -591,7 +591,7 @@ static bool sync_cmos_clock(void) * Architectures are strongly encouraged to use rtclib and not * implement this legacy API. 
*/ - getnstimeofday64(&now); + ktime_get_real_ts64(&now); if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4786df904c22..77c436a0070b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2310,7 +2310,7 @@ int do_adjtimex(struct timex *txc) return ret; } - getnstimeofday64(&ts); + ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); -- cgit v1.2.3 From 58a10456d7175fa674b951f1fd7ead3d9a550db4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:32:24 +0200 Subject: posix-timers: Use new ktime_get_*_ts64() helpers Some of the oddly named time accessor functions now have a more consistent naming, which should be used from now on so the aliases can be removed. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: y2038@lists.linaro.org Cc: Deepa Dinamani Cc: "Eric W. Biederman" Link: https://lkml.kernel.org/r/20180618143246.3865099-1-arnd@arndb.de --- kernel/time/posix-stubs.c | 2 +- kernel/time/posix-timers.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 26aa9569e24a..2c6847d5d69b 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -81,7 +81,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) ktime_get_ts64(tp); break; case CLOCK_BOOTTIME: - get_monotonic_boottime64(tp); + ktime_get_boottime_ts64(tp); break; default: return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e08ce3f27447..fcf90a10c43a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -228,21 +228,21 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { - getrawmonotonic64(tp); + ktime_get_raw_ts64(tp); return 0; } static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { - *tp = current_kernel_time64(); + ktime_get_coarse_real_ts64(tp); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { - *tp = get_monotonic_coarse64(); + ktime_get_coarse_ts64(tp); return 0; } @@ -254,13 +254,13 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime64(tp); + ktime_get_boottime_ts64(tp); return 0; } static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai64(tp); + ktime_get_clocktai_ts64(tp); return 0; } -- cgit v1.2.3 From dc1b7b6ca9d5fa905c164a667086d8a3d8605875 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 17:00:38 +0200 Subject: sysinfo: Remove get_monotonic_boottime() get_monotonic_boottime() is deprecated because it uses the old 'timespec' structure. This replaces one of the last callers with a call to ktime_get_boottime. 
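For context only (an illustrative sketch mirroring the hunk below, not additional patch content): struct timespec carries a 32-bit tv_sec on 32-bit ABIs, while struct timespec64 always uses a 64-bit tv_sec, so the boot-time read becomes roughly:

----
struct timespec64 tp;

ktime_get_boottime_ts64(&tp);                     /* boot-based clock, 64-bit seconds */
info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);  /* same rounding as do_sysinfo() */
----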
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Cyrill Gorcunov Cc: Andrew Morton Cc: y2038@lists.linaro.org Cc: Dominik Brodowski Cc: Cyrill Gorcunov Link: https://lkml.kernel.org/r/20180618150114.849216-1-arnd@arndb.de --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 38509dc1f77b..e27b51d3facd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2512,11 +2512,11 @@ static int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; - struct timespec tp; + struct timespec64 tp; memset(info, 0, sizeof(struct sysinfo)); - get_monotonic_boottime(&tp); + ktime_get_boottime_ts64(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); -- cgit v1.2.3 From 29c1372d6a9b872acf479ba2744e4e7f043981c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ondrej=20Mosn=C3=A1=C4=8Dek?= Date: Wed, 30 May 2018 10:45:24 +0200 Subject: audit: allow other filter list types for AUDIT_EXE This patch removes the restriction of the AUDIT_EXE field to only SYSCALL filter and teaches audit_filter to recognize this field. This makes it possible to write rule lists such as: auditctl -a exit,always [some general rule] # Filter out events with executable name /bin/exe1 or /bin/exe2: auditctl -a exclude,always -F exe=/bin/exe1 auditctl -a exclude,always -F exe=/bin/exe2 See: https://github.com/linux-audit/audit-kernel/issues/54 Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eaa320148d97..6db9847ca031 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -428,8 +428,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXE: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; - if (entry->rule.listnr != AUDIT_FILTER_EXIT) - return -EINVAL; break; } return 0; @@ -1360,6 +1358,11 @@ int audit_filter(int msgtype, unsigned int listtype) f->type, f->op, f->lsm_rule, NULL); } break; + case AUDIT_EXE: + result = audit_exe_compare(current, e->rule.exe); + if (f->op == Audit_not_equal) + result = !result; + break; default: goto unlock_and_return; } -- cgit v1.2.3 From 9b8753fffe7b3642688135f28aa8a0a0f45fd9ab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 31 May 2018 16:27:24 -0400 Subject: audit: tie SECCOMP records to syscall Since seccomp events are triggered by user activity, tie the SECCOMP record to the syscall record to collect all records from the same event. 
See: https://github.com/linux-audit/audit-kernel/issues/87 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ceb1c4596c51..fefb9e215cd0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2485,7 +2485,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP); if (unlikely(!ab)) return; audit_log_task(ab); -- cgit v1.2.3 From d87de4a878e110d0061fb22726d37a54a281285d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 31 May 2018 16:28:12 -0400 Subject: audit: tie ANOM_ABEND records to syscall Since core dump events are triggered by user activity, tie the ANOM_ABEND record to the syscall record to collect all records from the same event. See: https://github.com/linux-audit/audit-kernel/issues/88 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fefb9e215cd0..5f0bd5ece578 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2461,7 +2461,7 @@ void audit_core_dumps(long signr) if (signr == SIGQUIT) /* don't care for those */ return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND); if (unlikely(!ab)) return; audit_log_task(ab); -- cgit v1.2.3 From af85d1772e31fed34165a1b3decef340cf4080c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ondrej=20Mosn=C3=A1=C4=8Dek?= Date: Tue, 5 Jun 2018 11:00:10 +0200 Subject: audit: Fix extended comparison of GID/EGID The audit_filter_rules() function in auditsc.c used the in_[e]group_p() functions to check GID/EGID match, but these functions use the current task's credentials, while the comparison should use the credentials of the task given to audit_filter_rules() as a parameter (tsk). Note that we can use group_search(cred->group_info, ...) as a replacement for both in_group_p and in_egroup_p as these functions only compare the parameter to cred->fsgid/egid and then call group_search. In fact, the usage of in_group_p was even more incorrect: it compares to cred->fsgid (which is usually equal to cred->egid) and not cred->gid. 
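As a rough illustration of why the explicit credential matters (paraphrased for this note, not taken from the patch): in_group_p()/in_egroup_p() boil down to something like the following and always read current_cred(), instead of the tsk credentials that audit_filter_rules() already holds.

----
/* Paraphrased helper, illustration only: note the implicit current_cred() */
int in_egroup_p(kgid_t grp)
{
        const struct cred *cred = current_cred();   /* current task, not tsk */

        return gid_eq(grp, cred->egid) ||
               groups_search(cred->group_info, grp);
}

/* The filter needs the audited task's credentials instead, hence the hunk below: */
result = groups_search(cred->group_info, f->gid);   /* cred belongs to tsk */
----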
GitHub issue: https://github.com/linux-audit/audit-kernel/issues/82 Fixes: 37eebe39c973 ("audit: improve GID/EGID comparation logic") Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- kernel/auditsc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5f0bd5ece578..d762e0b8160e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -494,20 +494,20 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_gid_comparator(cred->gid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) - result = in_group_p(f->gid); + result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) - result = !in_group_p(f->gid); + result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) - result = in_egroup_p(f->gid); + result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) - result = !in_egroup_p(f->gid); + result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_SGID: -- cgit v1.2.3 From d904ac0320d3c4ff4e9d80e4294ca5dde803696f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 5 Jun 2018 11:45:07 -0400 Subject: audit: rename FILTER_TYPE to FILTER_EXCLUDE The AUDIT_FILTER_TYPE name is vague and misleading due to not describing where or when the filter is applied and obsolete due to its available filter fields having been expanded. Userspace has already renamed it from AUDIT_FILTER_TYPE to AUDIT_FILTER_EXCLUDE without checking if it already exists. The userspace maintainer assures that as long as it is set to the same value it will not be a problem since the userspace code does not treat compiler warnings as errors. If this policy changes then checks if it already exists can be added at the same time. 
See: https://github.com/linux-audit/audit-kernel/issues/89 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditfilter.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e7478cb58079..5c0a1d7b0c7b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1754,7 +1754,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (audit_initialized != AUDIT_INITIALIZED) return NULL; - if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) + if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE))) return NULL; /* NOTE: don't ever fail/sleep on these two conditions: diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6db9847ca031..bf309f2592c4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -264,7 +264,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * case AUDIT_FILTER_TASK: #endif case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: ; } @@ -337,7 +337,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { switch(f->type) { case AUDIT_MSGTYPE: - if (entry->rule.listnr != AUDIT_FILTER_TYPE && + if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; @@ -929,7 +929,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* If any of these, don't count towards total */ switch(entry->rule.listnr) { case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } @@ -1011,7 +1011,7 @@ int audit_del_rule(struct audit_entry *entry) /* If any of these, don't count towards total */ switch(entry->rule.listnr) { case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: dont_count = 1; } @@ -1372,7 +1372,7 @@ int audit_filter(int msgtype, unsigned int listtype) break; } if (result > 0) { - if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) + if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE) ret = 0; break; } -- cgit v1.2.3 From f7859590d97614815b35a755c8213dfb8f2766bd Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 5 Jun 2018 19:20:39 -0400 Subject: audit: eliminate audit_enabled magic number comparison Remove comparison of audit_enabled to magic numbers outside of audit. Related: https://github.com/linux-audit/audit-kernel/issues/86 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5c0a1d7b0c7b..0f3222e4edde 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -83,9 +83,6 @@ #define AUDIT_INITIALIZED 1 static int audit_initialized; -#define AUDIT_OFF 0 -#define AUDIT_ON 1 -#define AUDIT_LOCKED 2 u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; -- cgit v1.2.3 From 6519750210d9d3330e85d12e0128652edde448d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jun 2018 10:34:50 +0200 Subject: sched/swait: Remove __prepare_to_swait There is no public user of this API, remove it. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: bigeasy@linutronix.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180612083909.157076812@infradead.org --- kernel/sched/swait.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b6fb2c3b3ff7..e68bb1398b05 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -69,7 +69,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) -- cgit v1.2.3 From 0abf17bc7790dd0467ed0e38522242f23c5da1c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jun 2018 10:34:51 +0200 Subject: sched/swait: Switch to full exclusive mode Linus noted that swait basically implements exclusive mode -- because swake_up() only wakes a single waiter. And because of that it should take care to properly deal with the interruptible case. In short, the problem is that swake_up() can race with a signal. In this this case it is possible the swake_up() 'wakes' the waiter that is already on the way out because it just got a signal and the wakeup gets lost. The normal wait code is very careful and avoids this situation, make sure we do too. Copy the exact exclusive semantics from wait. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: bigeasy@linutronix.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180612083909.209762413@infradead.org --- kernel/sched/swait.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e68bb1398b05..66890de93ee5 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -73,7 +73,7 @@ static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w { wait->task = current; if (list_empty(&wait->task_list)) - list_add(&wait->task_list, &q->task_list); + list_add_tail(&wait->task_list, &q->task_list); } void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) @@ -89,12 +89,24 @@ EXPORT_SYMBOL(prepare_to_swait); long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) { - if (signal_pending_state(state, current)) - return -ERESTARTSYS; + unsigned long flags; + long ret = 0; - prepare_to_swait(q, wait, state); + raw_spin_lock_irqsave(&q->lock, flags); + if (unlikely(signal_pending_state(state, current))) { + /* + * See prepare_to_wait_event(). TL;DR, subsequent swake_up() + * must not see us. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + __prepare_to_swait(q, wait); + set_current_state(state); + } + raw_spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_swait_event); -- cgit v1.2.3 From b3dae109fa89d67334bf3349babab3ad9b6f233f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jun 2018 10:34:52 +0200 Subject: sched/swait: Rename to exclusive Since swait basically implemented exclusive waits only, make sure the API reflects that. 
$ git grep -l -e "\" -e "\" | while read file; do sed -i -e 's/\/&_one/g' -e 's/\/&_exclusive/g' $file; done With a few manual touch-ups. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: bigeasy@linutronix.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180612083909.261946548@infradead.org --- kernel/power/suspend.c | 4 ++-- kernel/rcu/srcutiny.c | 4 ++-- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree_exp.h | 4 ++-- kernel/rcu/tree_plugin.h | 12 ++++++------ kernel/sched/swait.c | 10 +++++----- 6 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 87331565e505..70178f6ffdc4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -92,7 +92,7 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - swait_event(s2idle_wait_head, + swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); @@ -160,7 +160,7 @@ void s2idle_wake(void) raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - swake_up(&s2idle_wait_head); + swake_up_one(&s2idle_wait_head); } raw_spin_unlock_irqrestore(&s2idle_lock, flags); } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 622792abe41a..04fc2ed71af8 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(sp->srcu_gp_waiting)) - swake_up(&sp->srcu_wq); + swake_up_one(&sp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp) idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ /* Invoke the callbacks we removed above. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aa7cade1b9f3..91f888d3b23a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1727,7 +1727,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - swake_up(&rsp->gp_wq); + swake_up_one(&rsp->gp_wq); } /* @@ -2002,7 +2002,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for swait_event_idle() wakeup at force-quiescent-state + * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) @@ -2144,7 +2144,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. 
*/ @@ -2176,7 +2176,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d40708e8c5d6..d428cc1064c8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); + swake_up_one(&rsp->expedited_wq); } break; } @@ -518,7 +518,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = swait_event_timeout( + ret = swait_event_timeout_exclusive( rsp->expedited_wq, sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fd12039e512..ad53d133f709 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1854,8 +1854,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ - swake_up(&rdp_leader->nocb_wq); + smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ + swake_up_one(&rdp_leader->nocb_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } @@ -2082,7 +2082,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - swait_event_interruptible( + swait_event_interruptible_exclusive( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2111,7 +2111,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible_exclusive(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = true; @@ -2176,7 +2176,7 @@ wait_again: raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* List was empty, so wake up the follower. */ - swake_up(&rdp->nocb_wq); + swake_up_one(&rdp->nocb_wq); } } @@ -2193,7 +2193,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible_exclusive(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. 
*/ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 66890de93ee5..66b59ac77c22 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); -void swake_up(struct swait_queue_head *q) +void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q) swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(swake_up); +EXPORT_SYMBOL(swake_up_one); /* * Does not allow usage from IRQ disabled, since we must be able to @@ -76,7 +76,7 @@ static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *w list_add_tail(&wait->task_list, &q->task_list); } -void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state) { unsigned long flags; @@ -85,7 +85,7 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int set_current_state(state); raw_spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(prepare_to_swait); +EXPORT_SYMBOL(prepare_to_swait_exclusive); long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) { @@ -95,7 +95,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait raw_spin_lock_irqsave(&q->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* - * See prepare_to_wait_event(). TL;DR, subsequent swake_up() + * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() * must not see us. */ list_del_init(&wait->task_list); -- cgit v1.2.3 From 5a6cf77f5e35e7af35d36a1e7dc21a42f6412e4f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 20 Jun 2018 01:05:07 +0900 Subject: kprobes: Remove jprobe API implementation Remove functionally empty jprobe API implementations and test cases. Signed-off-by: Masami Hiramatsu Acked-by: Thomas Gleixner Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/lkml/152942430705.15209.2307050500995264322.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 78 +----------------------------------------- kernel/test_kprobes.c | 94 --------------------------------------------------- 2 files changed, 1 insertion(+), 171 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ea619021d901..69de130595f7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1272,7 +1272,7 @@ NOKPROBE_SYMBOL(cleanup_rp_inst); /* * Add the new probe to ap->list. Fail if this is the -* second jprobe at the address - two jprobes can't coexist +* second break_handler at the address */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { @@ -1812,77 +1812,6 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -#if 0 -int register_jprobes(struct jprobe **jps, int num) -{ - int ret = 0, i; - - if (num <= 0) - return -EINVAL; - - for (i = 0; i < num; i++) { - ret = register_jprobe(jps[i]); - - if (ret < 0) { - if (i > 0) - unregister_jprobes(jps, i); - break; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(register_jprobes); - -int register_jprobe(struct jprobe *jp) -{ - unsigned long addr, offset; - struct kprobe *kp = &jp->kp; - - /* - * Verify probepoint as well as the jprobe handler are - * valid function entry points. 
- */ - addr = arch_deref_entry_point(jp->entry); - - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && - kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { - kp->pre_handler = setjmp_pre_handler; - kp->break_handler = longjmp_break_handler; - return register_kprobe(kp); - } - - return -EINVAL; -} -EXPORT_SYMBOL_GPL(register_jprobe); - -void unregister_jprobe(struct jprobe *jp) -{ - unregister_jprobes(&jp, 1); -} -EXPORT_SYMBOL_GPL(unregister_jprobe); - -void unregister_jprobes(struct jprobe **jps, int num) -{ - int i; - - if (num <= 0) - return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(&jps[i]->kp) < 0) - jps[i]->kp.addr = NULL; - mutex_unlock(&kprobe_mutex); - - synchronize_sched(); - for (i = 0; i < num; i++) { - if (jps[i]->kp.addr) - __unregister_kprobe_bottom(&jps[i]->kp); - } -} -EXPORT_SYMBOL_GPL(unregister_jprobes); -#endif - #ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe @@ -2329,8 +2258,6 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; - else if (p->pre_handler == setjmp_pre_handler) - kprobe_type = "j"; else kprobe_type = "k"; @@ -2637,6 +2564,3 @@ late_initcall(debugfs_kprobe_init); #endif /* CONFIG_DEBUG_FS */ module_init(init_kprobes); - -/* defined in arch/.../kernel/kprobes.c */ -EXPORT_SYMBOL_GPL(jprobe_return); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index dd53e354f630..7bca480151b0 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -162,90 +162,6 @@ static int test_kprobes(void) } -#if 0 -static u32 jph_val; - -static u32 j_kprobe_target(u32 value) -{ - if (preemptible()) { - handler_errors++; - pr_err("jprobe-handler is preemptible\n"); - } - if (value != rand1) { - handler_errors++; - pr_err("incorrect value in jprobe handler\n"); - } - - jph_val = rand1; - jprobe_return(); - return 0; -} - -static struct jprobe jp = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target" -}; - -static int test_jprobe(void) -{ - int ret; - - ret = register_jprobe(&jp); - if (ret < 0) { - pr_err("register_jprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); - unregister_jprobe(&jp); - if (jph_val == 0) { - pr_err("jprobe handler not called\n"); - handler_errors++; - } - - return 0; -} - -static struct jprobe jp2 = { - .entry = j_kprobe_target, - .kp.symbol_name = "kprobe_target2" -}; - -static int test_jprobes(void) -{ - int ret; - struct jprobe *jps[2] = {&jp, &jp2}; - - /* addr and flags should be cleard for reusing kprobe. 
*/ - jp.kp.addr = NULL; - jp.kp.flags = 0; - ret = register_jprobes(jps, 2); - if (ret < 0) { - pr_err("register_jprobes returned %d\n", ret); - return ret; - } - - jph_val = 0; - ret = target(rand1); - if (jph_val == 0) { - pr_err("jprobe handler not called\n"); - handler_errors++; - } - - jph_val = 0; - ret = target2(rand1); - if (jph_val == 0) { - pr_err("jprobe handler2 not called\n"); - handler_errors++; - } - unregister_jprobes(jps, 2); - - return 0; -} -#else -#define test_jprobe() (0) -#define test_jprobes() (0) -#endif #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -383,16 +299,6 @@ int init_test_probes(void) if (ret < 0) errors++; - num_tests++; - ret = test_jprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_jprobes(); - if (ret < 0) - errors++; - #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); -- cgit v1.2.3 From 059053a275b56eff1db11605c68988a6e9818561 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 20 Jun 2018 01:10:27 +0900 Subject: kprobes: Don't check the ->break_handler() in generic kprobes code Don't check the ->break_handler() from the core kprobes code, because it was only used by jprobes which got removed. ( In followup patches we'll remove the remaining calls in low level arch handlers as well and remove the callback altogether. ) Signed-off-by: Masami Hiramatsu Acked-by: Thomas Gleixner Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/lkml/152942462686.15209.6324404940493598980.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 69de130595f7..536ab451e96d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -627,8 +627,8 @@ static void optimize_kprobe(struct kprobe *p) (kprobe_disabled(p) || kprobes_all_disarmed)) return; - /* Both of break_handler and post_handler are not supported. */ - if (p->break_handler || p->post_handler) + /* kprobes with post_handler can not be optimized */ + if (p->post_handler) return; op = container_of(p, struct optimized_kprobe, kp); @@ -1116,20 +1116,6 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(aggr_fault_handler); -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe *cur = __this_cpu_read(kprobe_instance); - int ret = 0; - - if (cur && cur->break_handler) { - if (cur->break_handler(cur, regs)) - ret = 1; - } - reset_kprobe_instance(); - return ret; -} -NOKPROBE_SYMBOL(aggr_break_handler); - /* Walks the list and increments nmissed count for multiprobe case */ void kprobes_inc_nmissed_count(struct kprobe *p) { @@ -1270,24 +1256,15 @@ static void cleanup_rp_inst(struct kretprobe *rp) } NOKPROBE_SYMBOL(cleanup_rp_inst); -/* -* Add the new probe to ap->list. 
Fail if this is the -* second break_handler at the address -*/ +/* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); - if (p->break_handler || p->post_handler) + if (p->post_handler) unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ - if (p->break_handler) { - if (ap->break_handler) - return -EEXIST; - list_add_tail_rcu(&p->list, &ap->list); - ap->break_handler = aggr_break_handler; - } else - list_add_rcu(&p->list, &ap->list); + list_add_rcu(&p->list, &ap->list); if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; @@ -1310,8 +1287,6 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) /* We don't care the kprobe which has gone. */ if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); INIT_HLIST_NODE(&ap->hlist); @@ -1706,8 +1681,6 @@ static int __unregister_kprobe_top(struct kprobe *p) goto disarmed; else { /* If disabling probe has special handlers, update aggrprobe */ - if (p->break_handler && !kprobe_gone(p)) - ap->break_handler = NULL; if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) @@ -1911,7 +1884,6 @@ int register_kretprobe(struct kretprobe *rp) rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; rp->kp.fault_handler = NULL; - rp->kp.break_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { @@ -2034,7 +2006,6 @@ static void kill_kprobe(struct kprobe *p) list_for_each_entry_rcu(kp, &p->list, list) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; - p->break_handler = NULL; kill_optimized_kprobe(p); } /* -- cgit v1.2.3 From cce188bd58cfbd603b904dbce75f34de2eff959a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 20 Jun 2018 01:15:45 +0900 Subject: bpf/error-inject/kprobes: Clear current_kprobe and enable preempt in kprobe Clear current_kprobe and enable preemption in kprobe even if pre_handler returns !0. This simplifies function override using kprobes. Jprobe used to require to keep the preemption disabled and keep current_kprobe until it returned to original function entry. For this reason kprobe_int3_handler() and similar arch dependent kprobe handers checks pre_handler result and exit without enabling preemption if the result is !0. After removing the jprobe, Kprobes does not need to keep preempt disabled even if user handler returns !0 anymore. But since the function override handler in error-inject and bpf is also returns !0 if it overrides a function, to balancing the preempt count, it enables preemption and reset current kprobe by itself. That is a bad design that is very buggy. This fixes such unbalanced preempt-count and current_kprobes setting in kprobes, bpf and error-inject. Note: for powerpc and x86, this removes all preempt_disable from kprobe_ftrace_handler because ftrace callbacks are called under preempt disabled. Signed-off-by: Masami Hiramatsu Acked-by: Thomas Gleixner Acked-by: Naveen N. Rao Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: David S. 
Miller Cc: Fenghua Yu Cc: Heiko Carstens Cc: James Hogan Cc: Josef Bacik Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Steven Rostedt Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-ia64@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Link: https://lore.kernel.org/lkml/152942494574.15209.12323837825873032258.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/fail_function.c | 3 --- kernel/trace/trace_kprobe.c | 11 +++-------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 5349c91c2298..bc80a4e268c0 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -184,9 +184,6 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) if (should_fail(&fei_fault_attr, 1)) { regs_set_return_value(regs, attr->retval); override_function_with_return(regs); - /* Kprobe specific fixup */ - reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..7e3b944b6ac1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1217,16 +1217,11 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) /* * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the single stepping. - * The ftrace kprobe handler leaves it up to us to re-enable - * preemption here before returning if we've modified the ip. + * pt_regs, and if so return 1 so that we don't do the + * single stepping. */ - if (orig_ip != instruction_pointer(regs)) { - reset_current_kprobe(); - preempt_enable_no_resched(); + if (orig_ip != instruction_pointer(regs)) return 1; - } if (!ret) return 0; } -- cgit v1.2.3 From ba2591a5993eabcc8e874e30f361d8ffbb10d6d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 May 2018 16:43:46 +0200 Subject: sched/smt: Update sched_smt_present at runtime The static key sched_smt_present is only updated at boot time when SMT siblings have been detected. Booting with maxcpus=1 and bringing the siblings online after boot rebuilds the scheduling domains correctly but does not update the static key, so the SMT code is not enabled. Let the key be updated in the scheduler CPU hotplug code to fix this. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/sched/core.c | 30 ++++++++++++------------------ kernel/sched/fair.c | 1 + 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..0f79bb5ffb5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5777,6 +5777,18 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; +#ifdef CONFIG_SCHED_SMT + /* + * The sched_smt_present static key needs to be evaluated on every + * hotplug event because at boot time SMT might be disabled when + * the number of booted CPUs is limited. 
+ * + * If then later a sibling gets hotplugged, then the key would stay + * off and SMT scheduling would never be functional. + */ + if (cpumask_weight(cpu_smt_mask(cpu)) > 1) + static_branch_enable_cpuslocked(&sched_smt_present); +#endif set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -5874,22 +5886,6 @@ int sched_cpu_dying(unsigned int cpu) } #endif -#ifdef CONFIG_SCHED_SMT -DEFINE_STATIC_KEY_FALSE(sched_smt_present); - -static void sched_init_smt(void) -{ - /* - * We've enumerated all CPUs and will assume that if any CPU - * has SMT siblings, CPU0 will too. - */ - if (cpumask_weight(cpu_smt_mask(0)) > 1) - static_branch_enable(&sched_smt_present); -} -#else -static inline void sched_init_smt(void) { } -#endif - void __init sched_init_smp(void) { sched_init_numa(); @@ -5911,8 +5907,6 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); - sched_init_smt(); - sched_smp_initialized = true; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..cacdbef99b95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6238,6 +6238,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } #ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); static inline void set_idle_cores(int cpu, int val) { -- cgit v1.2.3 From c4de65696d865c225fda3b9913b31284ea65ea96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 May 2018 19:05:25 +0200 Subject: cpu/hotplug: Make bringup/teardown of smp threads symmetric The asymmetry caused a warning to trigger if the bootup was stopped in state CPUHP_AP_ONLINE_IDLE. The warning no longer triggers as kthread_park() can now be invoked on already or still parked threads. But there is still no reason to have this be asymmetric. Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/cpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..61396b3e9058 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -754,7 +754,6 @@ static int takedown_cpu(unsigned int cpu) /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); - smpboot_park_threads(cpu); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -1332,7 +1331,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", .startup.single = smpboot_unpark_threads, - .teardown.single = NULL, + .teardown.single = smpboot_park_threads, }, [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { .name = "irq/affinity:online", -- cgit v1.2.3 From cc1fe215e1efa406b03aa4389e6269b61342dec5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 May 2018 17:49:05 +0200 Subject: cpu/hotplug: Split do_cpu_down() Split out the inner workings of do_cpu_down() to allow reuse of that function for the upcoming SMT disabling mechanism. No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/cpu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 61396b3e9058..e266cb529c1a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -906,20 +906,19 @@ out: return ret; } +static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) +{ + if (cpu_hotplug_disabled) + return -EBUSY; + return _cpu_down(cpu, 0, target); +} + static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; cpu_maps_update_begin(); - - if (cpu_hotplug_disabled) { - err = -EBUSY; - goto out; - } - - err = _cpu_down(cpu, 0, target); - -out: + err = cpu_down_maps_locked(cpu, target); cpu_maps_update_done(); return err; } -- cgit v1.2.3 From 05736e4ac13c08a4a9b1ef2de26dd31a32cbee57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 May 2018 17:48:27 +0200 Subject: cpu/hotplug: Provide knobs to control SMT Provide a command line and a sysfs knob to control SMT. The command line options are: 'nosmt': Enumerate secondary threads, but do not online them 'nosmt=force': Ignore secondary threads completely during enumeration via MP table and ACPI/MADT. The sysfs control file has the following states (read/write): 'on': SMT is enabled. Secondary threads can be freely onlined 'off': SMT is disabled. Secondary threads, even if enumerated cannot be onlined 'forceoff': SMT is permanentely disabled. Writes to the control file are rejected. 'notsupported': SMT is not supported by the CPU The command line option 'nosmt' sets the sysfs control to 'off'. This can be changed to 'on' to reenable SMT during runtime. The command line option 'nosmt=force' sets the sysfs control to 'forceoff'. This cannot be changed during runtime. When SMT is 'on' and the control file is changed to 'off' then all online secondary threads are offlined and attempts to online a secondary thread later on are rejected. When SMT is 'off' and the control file is changed to 'on' then secondary threads can be onlined again. The 'off' -> 'on' transition does not automatically online the secondary threads. When the control file is set to 'forceoff', the behaviour is the same as setting it to 'off', but the operation is irreversible and later writes to the control file are rejected. When the control status is 'notsupported' then writes to the control file are rejected. 
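The control file ends up under the cpu subsys root, i.e. /sys/devices/system/cpu/smt/control, with the read-only /sys/devices/system/cpu/smt/active next to it. A hypothetical user-space sketch (not part of this patch, error handling kept minimal) that turns SMT off at runtime:

----
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/devices/system/cpu/smt/control", "w");

        if (!f)
                return 1;
        fputs("off", f);                /* "on", "off" or "forceoff" */
        return fclose(f) ? 1 : 0;       /* fails with EPERM after forceoff,
                                           ENODEV if SMT is not supported */
}
----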
Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Ingo Molnar --- kernel/cpu.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e266cb529c1a..d29fdd7e57bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -933,6 +933,29 @@ EXPORT_SYMBOL(cpu_down); #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ +#ifdef CONFIG_HOTPLUG_SMT +enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_control = CPU_SMT_DISABLED; + if (str && !strcmp(str, "force")) { + pr_info("SMT: Force disabled\n"); + cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } + return 0; +} +early_param("nosmt", smt_cmdline_disable); + +static inline bool cpu_smt_allowed(unsigned int cpu) +{ + return cpu_smt_control == CPU_SMT_ENABLED || + topology_is_primary_thread(cpu); +} +#else +static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +#endif + /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started @@ -1056,6 +1079,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) err = -EBUSY; goto out; } + if (!cpu_smt_allowed(cpu)) { + err = -EPERM; + goto out; + } err = _cpu_up(cpu, 0, target); out: @@ -1904,10 +1931,153 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { NULL }; +#ifdef CONFIG_HOTPLUG_SMT + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); +} + +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. 
+ */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +static void cpuhp_smt_enable(void) +{ + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + cpu_maps_update_done(); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int ctrlval, ret; + + if (sysfs_streq(buf, "on")) + ctrlval = CPU_SMT_ENABLED; + else if (sysfs_streq(buf, "off")) + ctrlval = CPU_SMT_DISABLED; + else if (sysfs_streq(buf, "forceoff")) + ctrlval = CPU_SMT_FORCE_DISABLED; + else + return -EINVAL; + + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) + return -EPERM; + + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return -ENODEV; + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + if (ctrlval != cpu_smt_control) { + switch (ctrlval) { + case CPU_SMT_ENABLED: + cpuhp_smt_enable(); + break; + case CPU_SMT_DISABLED: + case CPU_SMT_FORCE_DISABLED: + ret = cpuhp_smt_disable(ctrlval); + break; + } + } + + unlock_device_hotplug(); + return ret ? ret : count; +} +static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); + +static ssize_t +show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) +{ + bool active = topology_max_smt_threads() > 1; + + return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); +} +static DEVICE_ATTR(active, 0444, show_smt_active, NULL); + +static struct attribute *cpuhp_smt_attrs[] = { + &dev_attr_control.attr, + &dev_attr_active.attr, + NULL +}; + +static const struct attribute_group cpuhp_smt_attr_group = { + .attrs = cpuhp_smt_attrs, + .name = "smt", + NULL +}; + +static int __init cpu_smt_state_init(void) +{ + if (!topology_smt_supported()) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + + return sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_smt_attr_group); +} + +#else +static inline int cpu_smt_state_init(void) { return 0; } +#endif + static int __init cpuhp_sysfs_init(void) { int cpu, ret; + ret = cpu_smt_state_init(); + if (ret) + return ret; + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_cpu_root_attr_group); if (ret) -- cgit v1.2.3 From bfc18e389c7a09fbbbed6bf4032396685b14246e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 21 Jun 2018 13:13:04 +0100 Subject: atomics/treewide: Rename __atomic_add_unless() => atomic_fetch_add_unless() While __atomic_add_unless() was originally intended as a building-block for atomic_add_unless(), it's now used in a number of places around the kernel. It's the only common atomic operation named __atomic*(), rather than atomic_*(), and for consistency it would be better named atomic_fetch_add_unless(). This lack of consistency is slightly confusing, and gets in the way of scripting atomics. Given that, let's clean things up and promote it to an official part of the atomics API, in the form of atomic_fetch_add_unless(). This patch converts definitions and invocations over to the new name, including the instrumented version, using the following script: ---- git grep -w __atomic_add_unless | while read line; do sed -i '{s/\<__atomic_add_unless\>/atomic_fetch_add_unless/}' "${line%%:*}"; done git grep -w __arch_atomic_add_unless | while read line; do sed -i '{s/\<__arch_atomic_add_unless\>/arch_atomic_fetch_add_unless/}' "${line%%:*}"; done ---- Note that we do not have atomic{64,_long}_fetch_add_unless(), which will be introduced by later patches. There should be no functional change as a result of this patch. 
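For clarity (illustration only, not from the patch): atomic_fetch_add_unless(v, a, u) adds a to *v unless *v equals u, and returns the value *v held beforehand, which is what makes the "take a reference unless the object is already dead" pattern in the bpf hunk below work:

----
/* Illustrative refcount pattern; 'obj' is a placeholder name */
old = atomic_fetch_add_unless(&obj->refcnt, 1, 0);
if (!old)
        return ERR_PTR(-ENOENT);        /* refcnt was 0: object already going away */
----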
Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Palmer Dabbelt Cc: Boqun Feng Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/20180621121321.4761-2-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..f12db70d3bf3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -575,7 +575,7 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, { int refold; - refold = __atomic_add_unless(&map->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_map_put(map, false); @@ -1142,7 +1142,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_prog_put(prog, false); -- cgit v1.2.3 From 8f894bf47dc9e8b77166125a084a7217693a28cd Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Thu, 31 May 2018 19:11:19 +0800 Subject: sched/debug: Use match_string() helper instead of open-coded logic match_string() returns the index of an array for a matching string, which can be used instead of the open coded variant. Signed-off-by: Yisheng Xie Reviewed-by: Andy Shevchenko Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/1527765086-19873-15-git-send-email-xieyisheng1@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e593b4118578..c96e89cc4bc7 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp) cmp += 3; } - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } + i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp); + if (i < 0) + return i; + + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); } - return i; + return 0; } static ssize_t @@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, { char buf[64]; char *cmp; - int i; + int ret; struct inode *inode; if (cnt > 63) @@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); inode_lock(inode); - i = sched_feat_set(cmp); + ret = sched_feat_set(cmp); inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; + if (ret < 0) + return ret; *ppos += cnt; -- cgit v1.2.3 From 9e3ed2d7597ccbce2a30e036342fb0613d0759e5 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 21 May 2018 23:55:20 +0530 Subject: perf/core: Change perf_mmap_fault() return type to 'vm_fault_t' Use new return type 'vm_fault_t' for fault handlers. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. 
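As a hedged illustration of the intent (the handler name below is made up; the shape follows the perf_mmap_fault() change in the diff further down): once vm_fault_t becomes a distinct type, accidentally returning an errno from a fault handler can be flagged by static checkers instead of compiling silently.

----
/* Sketch only, not code from this patch. */
static vm_fault_t example_fault(struct vm_fault *vmf)
{
	if (vmf->pgoff != 0)
		return VM_FAULT_SIGBUS;	/* a VM_FAULT_* code, as intended */
	/* return -EFAULT; */		/* an errno: wrong, and catchable
					 * once vm_fault_t is distinct */
	return VM_FAULT_NOPAGE;
}
----
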
See the following commit: 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: alexander.shishkin@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung@kernel.org Link: https://lkml.kernel.org/lkml/20180521182520.GA19677@jordon-HP-15-Notebook-PC Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 80cca2b30c4f..a4d5ac6d74af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5273,11 +5273,11 @@ unlock: } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static int perf_mmap_fault(struct vm_fault *vmf) +static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; - int ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) -- cgit v1.2.3 From f2a3ab36077222437b4826fc76111caa14562b7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:35:01 +0900 Subject: kprobes: Make list and blacklist root user read only Since the blacklist and list files on debugfs indicates a sensitive address information to reader, it should be restricted to the root user. Suggested-by: Thomas Richter Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491890171.9916.5183693615601334087.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 536ab451e96d..898ee56d4f48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2509,7 +2509,7 @@ static int __init debugfs_kprobe_init(void) if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir, NULL, + file = debugfs_create_file("list", 0400, dir, NULL, &debugfs_kprobes_operations); if (!file) goto error; @@ -2519,7 +2519,7 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("blacklist", 0444, dir, NULL, + file = debugfs_create_file("blacklist", 0400, dir, NULL, &debugfs_kprobe_blacklist_ops); if (!file) goto error; -- cgit v1.2.3 From ffb9bd68ebdb3b8d00ef5a79bbe8167a3281cace Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:35:32 +0900 Subject: kprobes: Show blacklist addresses as same as kallsyms does Show kprobes blacklist addresses under same condition of showing kallsyms addresses. Since there are several name conflict for local symbols, kprobe blacklist needs to show each addresses so that user can identify where is on blacklist by comparing with kallsyms. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . 
Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491893217.9916.14760965896164273464.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 898ee56d4f48..ab1bfa3d1d9c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2326,8 +2326,16 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, - (void *)ent->end_addr, (void *)ent->start_addr); + /* + * If /proc/kallsyms is not showing kernel address, we won't + * show them here either. + */ + if (!kallsyms_show_value()) + seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, + (void *)ent->start_addr); + else + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); return 0; } -- cgit v1.2.3 From 81365a947de453bcaba2558f8de5dadcffe05bc1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:36:02 +0900 Subject: kprobes: Show address of kprobes if kallsyms does Show probed address in debugfs kprobe list file as same as kallsyms does. This information is used for checking kprobes are placed in the expected address. So it should be able to compared with address in kallsyms. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491896256.9916.1583733714492565296.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ab1bfa3d1d9c..1d6130d2937a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2226,19 +2226,23 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; + void *addr = p->addr; if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; else kprobe_type = "k"; + if (!kallsyms_show_value()) + addr = NULL; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s ", - p->addr, kprobe_type, sym, offset, + seq_printf(pi, "%px %s %s+0x%x %s ", + addr, kprobe_type, sym, offset, (modname ? 
modname : " ")); - else - seq_printf(pi, "%p %s %p ", - p->addr, kprobe_type, p->addr); + else /* try to use %pS */ + seq_printf(pi, "%px %s %pS ", + addr, kprobe_type, p->addr); if (!pp) pp = p; -- cgit v1.2.3 From 4458515b2c52831ee622411d2fe3e774d1f5c49a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:36:33 +0900 Subject: kprobes: Replace %p with other pointer types Replace %p with %pS or just remove it if unneeded. And use WARN_ONCE() if it is a single bug. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491899284.9916.5350534544808158621.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1d6130d2937a..ab257be4d924 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -710,9 +710,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) * there is still a relative jump) and disabled. */ op = container_of(ap, struct optimized_kprobe, kp); - if (unlikely(list_empty(&op->list))) - printk(KERN_WARNING "Warning: found a stray unused " - "aggrprobe@%p\n", ap->addr); + WARN_ON_ONCE(list_empty(&op->list)); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ @@ -985,7 +983,8 @@ static int arm_kprobe_ftrace(struct kprobe *p) ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); if (ret) { - pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", + p->addr, ret); return ret; } @@ -1025,7 +1024,8 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); - WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", + p->addr, ret); return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ @@ -2069,11 +2069,12 @@ out: } EXPORT_SYMBOL_GPL(enable_kprobe); +/* Caller must NOT call this in usual path. 
This is only for critical case */ void dump_kprobe(struct kprobe *kp) { - printk(KERN_WARNING "Dumping kprobe:\n"); - printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", - kp->symbol_name, kp->addr, kp->offset); + pr_err("Dumping kprobe:\n"); + pr_err("Name: %s\nOffset: %x\nAddress: %pS\n", + kp->symbol_name, kp->offset, kp->addr); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2096,11 +2097,8 @@ static int __init populate_kprobe_blacklist(unsigned long *start, entry = arch_deref_entry_point((void *)*iter); if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find blacklist at %p\n", - (void *)entry); + !kallsyms_lookup_size_offset(entry, &size, &offset)) continue; - } ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) -- cgit v1.2.3 From 03585a95cd830e7a92697d2a8fe9a34df87563db Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 11 Apr 2018 13:16:56 +0800 Subject: sched/fair: Remove stale tg_unthrottle_up() comments After commit: 82958366cfea ("sched: Replace update_shares weight distribution with per-entity computation") tg_unthrottle_up() did not update the weight. Signed-off-by: Li RongQing Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/lkml/1523423816-18322-1-git-send-email-lirongqing@baidu.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..4cc1441b0746 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4732,7 +4732,6 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } -/* updated child weight may affect parent so we have to do this bottom up */ static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; -- cgit v1.2.3 From 74bdf7815dfb3805a37b0bba615814063a227bf5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Jun 2018 08:03:32 -0700 Subject: genirq: Speedup show_interrupts() Since commit 425a5072dcd1 ("genirq: Free irq_desc with rcu"), show_interrupts() can be switched to rcu locking, which removes possible contention on sparse_irq_lock. The per_cpu count scan and print can be done without holding desc spinlock. And there is no need to call kstat_irqs_cpu() and abuse irq_to_desc() while holding rcu read lock, since desc and desc->kstat_irqs wont disappear or change. 
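As a rough sketch of the pattern relied on here (illustration only, not part of the patch; it condenses the per-CPU scan from the diff below): the RCU read-side section is what keeps both the descriptor and its per-CPU counters alive, so no descriptor lock is needed for the counting pass.

----
/*
 * Illustration only: sum the per-CPU counters of one interrupt under
 * rcu_read_lock().  Because irq descriptors are freed via call_rcu(),
 * neither 'desc' nor 'desc->kstat_irqs' can go away while the
 * read-side section is held.
 */
static unsigned int count_irq_example(unsigned int irq)
{
	struct irq_desc *desc;
	unsigned int sum = 0;
	int cpu;

	rcu_read_lock();
	desc = irq_to_desc(irq);
	if (desc && desc->kstat_irqs)
		for_each_online_cpu(cpu)
			sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
	rcu_read_unlock();

	return sum;
}
----
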
Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Eric Dumazet Link: https://lkml.kernel.org/r/20180620150332.163320-1-edumazet@google.com --- kernel/irq/proc.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 37eda10f5c36..da9addb8d655 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -475,22 +475,24 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(i); if (!desc) goto outsparse; - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if ((!action || irq_desc_is_chained(desc)) && !any_count) - goto out; + if (desc->kstat_irqs) + for_each_online_cpu(j) + any_count |= *per_cpu_ptr(desc->kstat_irqs, j); + + if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) + goto outsparse; seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, "%10u ", desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, j) : 0); + raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); @@ -511,6 +513,7 @@ int show_interrupts(struct seq_file *p, void *v) if (desc->name) seq_printf(p, "-%-8s", desc->name); + action = desc->action; if (action) { seq_printf(p, " %s", action->name); while ((action = action->next) != NULL) @@ -518,10 +521,9 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); -out: raw_spin_unlock_irqrestore(&desc->lock, flags); outsparse: - irq_unlock_sparse(); + rcu_read_unlock(); return 0; } #endif -- cgit v1.2.3 From 836557bd58e5e65c05c73af9f6ebed885dbfccfc Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 24 Jun 2018 10:35:18 +0200 Subject: genirq: Update code comments wrt recycled thread_mask Previously a race existed between __free_irq() and __setup_irq() wherein the thread_mask of a just removed action could be handed out to a newly added action and the freed irq thread would then tread on the oneshot mask bit of the newly added irq thread in irq_finalize_oneshot(): time | __free_irq() | raw_spin_lock_irqsave(&desc->lock, flags); | | raw_spin_unlock_irqrestore(&desc->lock, flags); | | __setup_irq() | raw_spin_lock_irqsave(&desc->lock, flags); | | raw_spin_unlock_irqrestore(&desc->lock, flags); | | irq_thread() of freed irq (__free_irq() waits in synchronize_irq()) | irq_thread_fn() | irq_finalize_oneshot() | raw_spin_lock_irq(&desc->lock); | desc->threads_oneshot &= ~action->thread_mask; | raw_spin_unlock_irq(&desc->lock); v The race was known at least since 2012 when it was documented in a code comment by commit e04268b0effc ("genirq: Remove paranoid warnons and bogus fixups"). The race itself is harmless as nothing touches any of the potentially freed data after synchronize_irq(). In 2017 the race was close by commit 9114014cf4e6 ("genirq: Add mutex to irq desc to serialize request/free_irq()"), apparently inadvertantly so because the race is neither mentioned in the commit message nor was the code comment updated. Make up for that. 
Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Cc: Bjorn Helgaas Cc: Mika Westerberg Cc: linux-pci@vger.kernel.org Link: https://lkml.kernel.org/r/32fc25aa35ecef4b2692f57687bb7fc2a57230e2.1529828292.git.lukas@wunner.de --- kernel/irq/manage.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 591cfe901162..123a227d3357 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1025,10 +1025,7 @@ static int irq_thread(void *data) * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the - * oneshot mask bit can be set. We cannot verify that as we - * cannot touch the oneshot mask at this point anymore as - * __setup_irq() might have given out currents thread_mask - * again. + * oneshot mask bit can be set. */ task_work_cancel(current, irq_thread_dtor); return 0; @@ -1245,7 +1242,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_irq() to complete without holding the optional - * chip bus lock and desc->lock. + * chip bus lock and desc->lock. Also protects against handing out + * a recycled oneshot thread_mask bit while it's still in use by + * its previous owner. */ mutex_lock(&desc->request_mutex); -- cgit v1.2.3 From 519cc8652b3a1d3d0a02257afbe9573ad644da26 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 24 Jun 2018 10:35:30 +0200 Subject: genirq: Synchronize only with single thread on free_irq() When pciehp is converted to threaded IRQ handling, removal of unplugged devices below a PCIe hotplug port happens synchronously in the IRQ thread. Removal of devices typically entails a call to free_irq() by their drivers. If those devices share their IRQ with the hotplug port, __free_irq() deadlocks because it calls synchronize_irq() to wait for all hard IRQ handlers as well as all threads sharing the IRQ to finish. Actually it's sufficient to wait only for the IRQ thread of the removed device, so call synchronize_hardirq() to wait for all hard IRQ handlers to finish, but no longer for any threads. Compensate by rearranging the control flow in irq_wait_for_interrupt() such that the device's thread is allowed to run one last time after kthread_stop() has been called. kthread_stop() blocks until the IRQ thread has completed. On completion the IRQ thread clears its oneshot thread_mask bit. This is safe because __free_irq() holds the request_mutex, thereby preventing __setup_irq() from handing out the same oneshot thread_mask bit to a newly requested action. Stack trace for posterity: INFO: task irq/17-pciehp:94 blocked for more than 120 seconds. 
schedule+0x28/0x80 synchronize_irq+0x6e/0xa0 __free_irq+0x15a/0x2b0 free_irq+0x33/0x70 pciehp_release_ctrl+0x98/0xb0 pcie_port_remove_service+0x2f/0x40 device_release_driver_internal+0x157/0x220 bus_remove_device+0xe2/0x150 device_del+0x124/0x340 device_unregister+0x16/0x60 remove_iter+0x1a/0x20 device_for_each_child+0x4b/0x90 pcie_port_device_remove+0x1e/0x30 pci_device_remove+0x36/0xb0 device_release_driver_internal+0x157/0x220 pci_stop_bus_device+0x7d/0xa0 pci_stop_bus_device+0x3d/0xa0 pci_stop_and_remove_bus_device+0xe/0x20 pciehp_unconfigure_device+0xb8/0x160 pciehp_disable_slot+0x84/0x130 pciehp_ist+0x158/0x190 irq_thread_fn+0x1b/0x50 irq_thread+0x143/0x1a0 kthread+0x111/0x130 Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Cc: Bjorn Helgaas Cc: Mika Westerberg Cc: linux-pci@vger.kernel.org Link: https://lkml.kernel.org/r/d72b41309f077c8d3bee6cc08ad3662d50b5d22a.1529828292.git.lukas@wunner.de --- kernel/irq/manage.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 123a227d3357..1f8be33572a7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -790,9 +790,19 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static int irq_wait_for_interrupt(struct irqaction *action) { - set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + if (kthread_should_stop()) { + /* may need to run one last time */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { + __set_current_state(TASK_RUNNING); + return 0; + } + __set_current_state(TASK_RUNNING); + return -1; + } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { @@ -800,10 +810,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) return 0; } schedule(); - set_current_state(TASK_INTERRUPTIBLE); } - __set_current_state(TASK_RUNNING); - return -1; } /* @@ -1024,7 +1031,7 @@ static int irq_thread(void *data) /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling - * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the + * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ task_work_cancel(current, irq_thread_dtor); @@ -1241,7 +1248,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * Protects against a concurrent __free_irq() call which might wait - * for synchronize_irq() to complete without holding the optional + * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. @@ -1612,11 +1619,11 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang - * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock - * because synchronize_irq() would wait forever for the thread to + * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. 
* * The still held desc->request_mutex() protects against a @@ -1628,7 +1635,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) unregister_handler_proc(irq, action); /* Make sure it's not being used on another CPU: */ - synchronize_irq(irq); + synchronize_hardirq(irq); #ifdef CONFIG_DEBUG_SHIRQ /* @@ -1646,6 +1653,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } #endif + /* + * The action has already been removed above, but the thread writes + * its oneshot mask bit when it completes. Though request_mutex is + * held across this which prevents __setup_irq() from handing out + * the same bit to a newly requested action. + */ if (action->thread) { kthread_stop(action->thread); put_task_struct(action->thread); -- cgit v1.2.3 From d0dd63a8aee1ef89f2e48e554b796b9f9e4fcadb Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 16 Jun 2018 22:11:42 -0700 Subject: time: Introduce struct __kernel_itimerspec struct itimerspec is not y2038-safe. Introduce a new struct __kernel_itimerspec based on the kernel internal y2038-safe struct itimerspec64. The definition of struct __kernel_itimerspec includes two struct __kernel_timespec. Since struct __kernel_timespec has the same representation in native and compat modes, so does struct __kernel_itimerspec. This helps have a common entry point for syscalls using struct __kernel_itimerspec. New y2038-safe syscalls will use this new type. Since most of the new syscalls are just an update to the native syscalls with the type update, place the new definition under CONFIG_64BIT_TIME. This helps architectures that do not support the above config to keep using the old definition of struct itimerspec. Also change the get/put_itimerspec64 to use struct__kernel_itimerspec. This will help 32 bit architectures to use the new syscalls when architectures select CONFIG_64BIT_TIME. Signed-off-by: Deepa Dinamani Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: viro@zeniv.linux.org.uk Cc: linux-fsdevel@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: y2038@lists.linaro.org Link: https://lkml.kernel.org/r/20180617051144.29756-2-deepa.kernel@gmail.com --- kernel/time/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index b1225db61eb2..c0195225fdce 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -927,7 +927,7 @@ int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) EXPORT_SYMBOL_GPL(compat_put_timespec64); int get_itimerspec64(struct itimerspec64 *it, - const struct itimerspec __user *uit) + const struct __kernel_itimerspec __user *uit) { int ret; @@ -942,7 +942,7 @@ int get_itimerspec64(struct itimerspec64 *it, EXPORT_SYMBOL_GPL(get_itimerspec64); int put_itimerspec64(const struct itimerspec64 *it, - struct itimerspec __user *uit) + struct __kernel_itimerspec __user *uit) { int ret; -- cgit v1.2.3 From afef05cf238cfcecdc5ea9b06f31027b13ce6214 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 16 Jun 2018 22:11:43 -0700 Subject: time: Enable get/put_compat_itimerspec64 always This will aid in enabling the compat syscalls on 32-bit architectures later on. Also move compat_itimerspec and related defines to compat_time.h. The compat_time.h file will eventually be deleted. 
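For readers following the series, the two layouts involved look roughly as below. This is a hedged sketch from memory of the uapi/compat headers rather than an authoritative quote: compat_itimerspec carries 32-bit seconds and is what the moved get/put helpers translate, while __kernel_itimerspec is built from 64-bit fields and therefore has the same representation for native and compat callers.

----
/* Hedged sketch of the two layouts (field types paraphrased). */
struct compat_timespec {
	compat_time_t	tv_sec;		/* 32-bit seconds: overflows in 2038 */
	s32		tv_nsec;
};

struct compat_itimerspec {
	struct compat_timespec it_interval;
	struct compat_timespec it_value;
};

struct __kernel_timespec {
	__kernel_time64_t	tv_sec;		/* 64-bit seconds */
	long long		tv_nsec;
};

struct __kernel_itimerspec {
	struct __kernel_timespec it_interval;	/* timer period */
	struct __kernel_timespec it_value;	/* timer expiration */
};
----
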
Signed-off-by: Deepa Dinamani Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: viro@zeniv.linux.org.uk Cc: linux-fsdevel@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: y2038@lists.linaro.org Link: https://lkml.kernel.org/r/20180617051144.29756-3-deepa.kernel@gmail.com --- kernel/compat.c | 29 ----------------------------- kernel/time/time.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 702aa846ddac..8e40efc2928a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -324,35 +324,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } -/* Todo: Delete these extern declarations when get/put_compat_itimerspec64() - * are moved to kernel/time/time.c . - */ -extern int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts); -extern int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts); - -int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits) -{ - - if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || - __compat_get_timespec64(&its->it_value, &uits->it_value)) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL_GPL(get_compat_itimerspec64); - -int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits) -{ - if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || - __compat_put_timespec64(&its->it_value, &uits->it_value)) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL_GPL(put_compat_itimerspec64); - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/time.c b/kernel/time/time.c index c0195225fdce..72caf051901c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -955,3 +955,24 @@ int put_itimerspec64(const struct itimerspec64 *it, return ret; } EXPORT_SYMBOL_GPL(put_itimerspec64); + +int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits) +{ + + if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || + __compat_get_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(get_compat_itimerspec64); + +int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits) +{ + if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || + __compat_put_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(put_compat_itimerspec64); -- cgit v1.2.3 From 6ff84735070276d72af716e21c3214ee20d60e70 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 16 Jun 2018 22:11:44 -0700 Subject: time: Change types to new y2038 safe __kernel_itimerspec timer_set/gettime and timerfd_set/get apis use struct itimerspec at the user interface layer. struct itimerspec is not y2038-safe. Change these interfaces to use y2038-safe struct __kernel_itimerspec instead. This will help define new syscalls when 32bit architectures select CONFIG_64BIT_TIME. 
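From userspace the call sites keep their familiar shape; the sketch below is ordinary POSIX timer usage (illustration only, not kernel code; link with -lrt on older glibc), showing the timer_settime()/timer_gettime() pair whose kernel-side argument type is widened by this patch.

----
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t timer;
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct itimerspec its = {
		.it_value    = { .tv_sec = 5 },
		.it_interval = { .tv_sec = 0 },
	};

	if (timer_create(CLOCK_MONOTONIC, &sev, &timer))
		return 1;
	if (timer_settime(timer, 0, &its, NULL))
		return 1;
	if (timer_gettime(timer, &its))
		return 1;

	printf("remaining: %lld.%09ld s\n",
	       (long long)its.it_value.tv_sec, its.it_value.tv_nsec);

	return timer_delete(timer);
}
----
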
Signed-off-by: Deepa Dinamani Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: viro@zeniv.linux.org.uk Cc: linux-fsdevel@vger.kernel.org Cc: linux-api@vger.kernel.org Cc: y2038@lists.linaro.org Link: https://lkml.kernel.org/r/20180617051144.29756-4-deepa.kernel@gmail.com --- kernel/time/posix-timers.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index fcf90a10c43a..80d59333c76e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -743,7 +743,7 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) + struct __kernel_itimerspec __user *, setting) { struct itimerspec64 cur_setting; @@ -755,7 +755,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME + COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct compat_itimerspec __user *, setting) { @@ -768,6 +769,7 @@ COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, } return ret; } + #endif /* @@ -906,8 +908,8 @@ retry: /* Set a POSIX.1b interval timer */ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) + const struct __kernel_itimerspec __user *, new_setting, + struct __kernel_itimerspec __user *, old_setting) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; @@ -927,7 +929,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, return error; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct compat_itimerspec __user *, new, struct compat_itimerspec __user *, old) -- cgit v1.2.3 From faef87723ace12e5f685437c1e68cbecb69a7a89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Jun 2018 13:08:51 +0200 Subject: dma-noncoherent: add a arch_sync_dma_for_cpu_all hook The MIPS bmips platform needs a global flush when transferring ownership back to the CPU. Add a hook for that to the dma-noncoherent implementation. 
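The header-side plumbing this relies on can be sketched as follows (a hedged guess at the pattern, not a quote of the actual header): architectures that need the global flush select CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL and provide the function, everyone else gets an empty inline stub so the new calls in the diff below compile away.

----
/* Hedged sketch of the hook declaration/stub pattern. */
#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
void arch_sync_dma_for_cpu_all(struct device *dev);
#else
static inline void arch_sync_dma_for_cpu_all(struct device *dev)
{
}
#endif
----
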
Signed-off-by: Christoph Hellwig Patchwork: https://patchwork.linux-mips.org/patch/19549/ Signed-off-by: Paul Burton Cc: Florian Fainelli Cc: David Daney Cc: Kevin Cernekee Cc: Jiaxun Yang Cc: Tom Bogendoerfer Cc: Huacai Chen Cc: iommu@lists.linux-foundation.org Cc: linux-mips@linux-mips.org --- kernel/dma/noncoherent.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c index 79e9a757387f..031fe235d958 100644 --- a/kernel/dma/noncoherent.c +++ b/kernel/dma/noncoherent.c @@ -49,11 +49,13 @@ static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, return nents; } -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) static void dma_noncoherent_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); + arch_sync_dma_for_cpu_all(dev); } static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, @@ -64,6 +66,7 @@ static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, for_each_sg(sgl, sg, nents, i) arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu_all(dev); } static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, @@ -89,7 +92,8 @@ const struct dma_map_ops dma_noncoherent_ops = { .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, .map_page = dma_noncoherent_map_page, .map_sg = dma_noncoherent_map_sg, -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, .unmap_page = dma_noncoherent_unmap_page, -- cgit v1.2.3 From 5257514d8885b2ebc11bf359ea1527282b47a5fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Apr 2018 11:03:39 -0700 Subject: rcu: Make expedited grace period use direct call on last leaf During expedited grace-period initialization, a work item is scheduled for each leaf rcu_node structure. However, that initialization code is itself (normally) executing from a workqueue, so one of the leaf rcu_node structures could just as well be handled by that pre-existing workqueue, and with less overhead. This commit therefore uses a shiny new rcu_is_leaf_node() macro to execute the last leaf rcu_node structure's initialization directly from the pre-existing workqueue. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 3 +++ kernel/rcu/tree_exp.h | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 40cea6735c2d..db0870acfdff 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -276,6 +276,9 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) /* Is this rcu_node a leaf? */ #define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) +/* Is this rcu_node the last leaf? */ +#define rcu_is_last_leaf_node(rsp, rnp) ((rnp) == &(rsp)->node[rcu_num_nodes - 1]) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. 
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d40708e8c5d6..c6385ee1af65 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -486,8 +486,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rnp->rew.rew_func = func; rnp->rew.rew_rsp = rsp; if (!READ_ONCE(rcu_par_gp_wq) || - rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - /* No workqueues yet. */ + rcu_scheduler_active != RCU_SCHEDULER_RUNNING || + rcu_is_last_leaf_node(rsp, rnp)) { + /* No workqueues yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } -- cgit v1.2.3 From 5ef98a6328a1506f544a64b28d6a8a7b99af475b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Apr 2018 21:30:13 -0700 Subject: srcu: Fix typos in __call_srcu() header comment This commit simply changes some copy-pasta call_rcu() instances to the correct call_srcu(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b4123d7a2cec..615e414b0c1e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -823,17 +823,17 @@ static void srcu_leak_callback(struct rcu_head *rhp) * more than one CPU, this means that when "func()" is invoked, each CPU * is guaranteed to have executed a full memory barrier since the end of * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing + * preceded the call to call_srcu(). It also means that each CPU executing * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() + * "func()" must have executed a memory barrier after the call_srcu() * but before the beginning of that SRCU read-side critical section. * Note that these guarantees include CPUs that are offline, idle, or * executing in user mode, as well as CPUs that are executing in the kernel. * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the * resulting SRCU callback function "func()", then both CPU A and CPU * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". + * interval between the call to call_srcu() and the invocation of "func()". * This guarantee applies even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). * -- cgit v1.2.3 From 17294ce6a41d3fee6a9bfc52387c107a4607c1c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Apr 2018 12:03:36 -0700 Subject: srcu: Document that srcu_funnel_gp_start() implies srcu_funnel_exp_start() This commit updates the header comment of srcu_funnel_gp_start() to document the fact that srcu_funnel_gp_start() does the work of srcu_funnel_exp_start(), in some cases by invoking it directly. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 615e414b0c1e..37bef2fe80e6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -641,6 +641,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, * period s. 
Losers must either ensure that their desired grace-period * number is recorded on at least their leaf srcu_node structure, or they * must take steps to invoke their own callbacks. + * + * Note that this function also does the work of srcu_funnel_exp_start(), + * in some cases by directly invoking it. */ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, unsigned long s, bool do_norm) -- cgit v1.2.3 From 5ab07a8df4d6c958ca63640d3f2ef896f0679c05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 12:28:04 -0700 Subject: srcu: Add address of first callback to rcutorture output This commit adds the address of the first callback to the per-CPU rcutorture output in order to allow lost wakeups to be more efficiently tracked down. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 37bef2fe80e6..5a1a9a07b407 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1271,11 +1271,11 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_data *counts; + struct srcu_data *sdp; - counts = per_cpu_ptr(sp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; + sdp = per_cpu_ptr(sp->sda, cpu); + u0 = sdp->srcu_unlock_count[!idx]; + u1 = sdp->srcu_unlock_count[idx]; /* * Make sure that a lock is always counted if the corresponding @@ -1283,12 +1283,13 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) */ smp_rmb(); - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; + l0 = sdp->srcu_lock_count[!idx]; + l1 = sdp->srcu_lock_count[idx]; c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld %1p)", + cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); s0 += c0; s1 += c1; } -- cgit v1.2.3 From 90127d605f403d814f4986436871210bf8ceb335 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2018 10:29:18 -0700 Subject: torture: Make online/offline messages appear only for verbose=2 Some bugs reproduce quickly only at high CPU-hotplug rates, so the rcutorture TREE03 scenario now has only 200 milliseconds spacing between CPU-hotplug operations. At this rate, the torture-test pair of console messages per operation becomes a bit voluminous. This commit therefore converts the torture-test set of "verbose" kernel-boot arguments from bool to int, and prints the extra console messages only when verbose=2. The default is still verbose=1. Signed-off-by: Paul E. 
McKenney --- kernel/locking/locktorture.c | 2 +- kernel/rcu/rcuperf.c | 2 +- kernel/rcu/rcutorture.c | 2 +- kernel/torture.c | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8402b3349dca..4a2e13870a9b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -57,7 +57,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "spin_lock"; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e232846516b3..fb8094848906 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,7 +88,7 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); -torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 42fcb7f05fac..a5540bd831c4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -101,7 +101,7 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, true, +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; diff --git a/kernel/torture.c b/kernel/torture.c index 3de1efbecd6a..840fd33c1cda 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -53,7 +53,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); static char *torture_type; -static bool verbose; +static int verbose; /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. 
*/ @@ -98,7 +98,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlining %d\n", torture_type, cpu); @@ -111,7 +111,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, "torture_onoff task: offline %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); @@ -147,7 +147,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlining %d\n", torture_type, cpu); @@ -160,7 +160,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, "torture_onoff task: online %d failed: errno %d\n", torture_type, cpu, ret); } else { - if (verbose) + if (verbose > 1) pr_alert("%s" TORTURE_FLAG "torture_onoff task: onlined %d\n", torture_type, cpu); @@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v) +bool torture_init_begin(char *ttype, int v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { -- cgit v1.2.3 From 60500037637397dcc8ea3d3c2f16e05ea6695a86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 12:25:05 -0700 Subject: torture: Keep old-school dmesg format This commit adds "#define pr_fmt(fmt) fmt" to the torture-test files in order to keep the current dmesg format. Once Joe's commits have hit mainline, these definitions will be changed in order to automatically generate the dmesg line prefix that the scripts expect. This will have the beneficial side-effect of allowing printk() formats to be used more widely and of shortening some pr_*() lines. Signed-off-by: Paul E. McKenney Cc: Joe Perches --- kernel/locking/locktorture.c | 3 +++ kernel/rcu/rcuperf.c | 3 +++ kernel/rcu/rcutorture.c | 3 +++ kernel/torture.c | 3 +++ 4 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 4a2e13870a9b..57bef4fbfb31 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -21,6 +21,9 @@ * Davidlohr Bueso * Based on kernel/rcu/torture.c. */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index fb8094848906..df29119b2013 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -19,6 +19,9 @@ * * Authors: Paul E. McKenney */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a5540bd831c4..5604bfac8df4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -22,6 +22,9 @@ * * See also: Documentation/RCU/torture.txt */ + +#define pr_fmt(fmt) fmt + #include #include #include diff --git a/kernel/torture.c b/kernel/torture.c index 840fd33c1cda..1ac24a826589 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -20,6 +20,9 @@ * Author: Paul E. McKenney * Based on kernel/rcu/torture.c. 
*/ + +#define pr_fmt(fmt) fmt + #include #include #include -- cgit v1.2.3 From 9a4903dde2c8633c5fcf887b98c4e047a6154a54 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:48 +0200 Subject: perf/hw_breakpoint: Split attribute parse and commit arch_validate_hwbkpt_settings() mixes up attribute check and commit into a single code entity. Therefore the validation may return an error due to incorrect atributes while still leaving halfway modified architecture breakpoint data. This is harmless when we deal with a new breakpoint but it becomes a problem when we modify an existing breakpoint. Split attribute parse and commit to fix that. The architecture is passed a "struct arch_hw_breakpoint" to fill on top of the new attr and the core takes care about copying the backend data once it's fully validated. The architectures then need to implement the new API. Original-patch-by: Andy Lutomirski Reported-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 57 +++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6e28d2866be5..314e2a9040c7 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -400,16 +400,35 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } -static int validate_hw_breakpoint(struct perf_event *bp) +#ifndef hw_breakpoint_arch_parse +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - int ret; + int err; - ret = arch_validate_hwbkpt_settings(bp); - if (ret) - return ret; + err = arch_validate_hwbkpt_settings(bp); + if (err) + return err; + + *hw = bp->hw.info; + + return 0; +} +#endif + +static int hw_breakpoint_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + int err; + + err = hw_breakpoint_arch_parse(bp, attr, hw); + if (err) + return err; if (arch_check_bp_in_kernelspace(bp)) { - if (bp->attr.exclude_kernel) + if (attr->exclude_kernel) return -EINVAL; /* * Don't let unprivileged users set a breakpoint in the trap @@ -424,19 +443,22 @@ static int validate_hw_breakpoint(struct perf_event *bp) int register_perf_hw_breakpoint(struct perf_event *bp) { - int ret; - - ret = reserve_bp_slot(bp); - if (ret) - return ret; + struct arch_hw_breakpoint hw; + int err; - ret = validate_hw_breakpoint(bp); + err = reserve_bp_slot(bp); + if (err) + return err; - /* if arch_validate_hwbkpt_settings() fails then release bp slot */ - if (ret) + err = hw_breakpoint_parse(bp, &bp->attr, &hw); + if (err) { release_bp_slot(bp); + return err; + } - return ret; + bp->hw.info = hw; + + return 0; } /** @@ -464,6 +486,7 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; bool modify = attr->bp_type != old_type; + struct arch_hw_breakpoint hw; int err = 0; 
bp->attr.bp_addr = attr->bp_addr; @@ -473,7 +496,7 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a if (check && memcmp(&bp->attr, attr, sizeof(*attr))) return -EINVAL; - err = validate_hw_breakpoint(bp); + err = hw_breakpoint_parse(bp, attr, &hw); if (!err && modify) err = modify_bp_slot(bp, old_type); @@ -484,7 +507,9 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a return err; } + bp->hw.info = hw; bp->attr.disabled = attr->disabled; + return 0; } -- cgit v1.2.3 From 8e983ff9ac02a8fb454ed09c2462bdb3617006a8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:49 +0200 Subject: perf/hw_breakpoint: Pass arch breakpoint struct to arch_check_bp_in_kernelspace() We can't pass the breakpoint directly on arch_check_bp_in_kernelspace() anymore because its architecture internal datas (struct arch_hw_breakpoint) are not yet filled by the time we call the function, and most implementation need this backend to be up to date. So arrange the function to take the probing struct instead. Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-3-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 314e2a9040c7..89fe94c9214c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -427,7 +427,7 @@ static int hw_breakpoint_parse(struct perf_event *bp, if (err) return err; - if (arch_check_bp_in_kernelspace(bp)) { + if (arch_check_bp_in_kernelspace(hw)) { if (attr->exclude_kernel) return -EINVAL; /* -- cgit v1.2.3 From cffbb3bd444bc95186b6ab0ed3bf1d5bcf4aa620 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:57 +0200 Subject: perf/hw_breakpoint: Remove default hw_breakpoint_arch_parse() All architectures have implemented it, we can now remove the poor weak version. 
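For orientation, a per-architecture implementation of the hook now has roughly the following shape. This is a deliberately loose sketch: struct arch_hw_breakpoint is architecture specific (the field names below are only illustrative) and real implementations do more validation, but the contract is the important part -- parse 'attr' into the caller-provided 'hw' without touching the live breakpoint, and return 0 or a negative errno.

----
/* Loose sketch of a per-arch parse hook; field names are illustrative. */
int hw_breakpoint_arch_parse(struct perf_event *bp,
			     const struct perf_event_attr *attr,
			     struct arch_hw_breakpoint *hw)
{
	switch (attr->bp_len) {
	case HW_BREAKPOINT_LEN_1:
	case HW_BREAKPOINT_LEN_2:
	case HW_BREAKPOINT_LEN_4:
	case HW_BREAKPOINT_LEN_8:
		break;
	default:
		return -EINVAL;
	}

	/* Fill only the scratch copy; the core commits it to bp->hw.info. */
	hw->address = attr->bp_addr;
	hw->len = attr->bp_len;

	return 0;
}
----
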
Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-11-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 89fe94c9214c..e7bc8d0a5972 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -400,23 +400,6 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } -#ifndef hw_breakpoint_arch_parse -int hw_breakpoint_arch_parse(struct perf_event *bp, - const struct perf_event_attr *attr, - struct arch_hw_breakpoint *hw) -{ - int err; - - err = arch_validate_hwbkpt_settings(bp); - if (err) - return err; - - *hw = bp->hw.info; - - return 0; -} -#endif - static int hw_breakpoint_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) -- cgit v1.2.3 From cb8b78815b68b048303f55c276d2aef147e8f2e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:58 +0200 Subject: perf/hw_breakpoint: Pass new breakpoint type to modify_breakpoint_slot() We soon won't be able to rely on bp->attr anymore to get the new type of the modifying breakpoint because the new attributes are going to be copied only once we successfully modified the breakpoint slot. This will fix the current misdesigned layout where the new attr are copied to the modifying breakpoint before we actually know if the modification will be validated. In order to prepare for that, allow modify_breakpoint_slot() to take the new breakpoint type. 
Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-12-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index e7bc8d0a5972..71387703b5f1 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -345,13 +345,13 @@ void release_bp_slot(struct perf_event *bp) mutex_unlock(&nr_bp_mutex); } -static int __modify_bp_slot(struct perf_event *bp, u64 old_type) +static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int err; __release_bp_slot(bp, old_type); - err = __reserve_bp_slot(bp, bp->attr.bp_type); + err = __reserve_bp_slot(bp, new_type); if (err) { /* * Reserve the old_type slot back in case @@ -367,12 +367,12 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type) return err; } -static int modify_bp_slot(struct perf_event *bp, u64 old_type) +static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { int ret; mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type); + ret = __modify_bp_slot(bp, old_type, new_type); mutex_unlock(&nr_bp_mutex); return ret; } @@ -481,7 +481,7 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a err = hw_breakpoint_parse(bp, attr, &hw); if (!err && modify) - err = modify_bp_slot(bp, old_type); + err = modify_bp_slot(bp, old_type, bp->attr.bp_type); if (err) { bp->attr.bp_addr = old_addr; -- cgit v1.2.3 From 26c6ccdf5c06aa83bb78518c72cb287f043db3df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Jun 2018 04:58:59 +0200 Subject: perf/hw_breakpoint: Clean up and consolidate modify_user_hw_breakpoint_check() Remove the dance around old and new attributes. Just don't modify the previous breakpoint at all until we have verified everything. 
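A tiny userspace sketch of the ordering this rewrite establishes (hypothetical names, not the kernel implementation): parse and validate the new attributes into locals first, and copy them into the live breakpoint only once nothing can fail anymore:

#include <stdio.h>

struct attr { unsigned long addr; int type; int len; int disabled; };
struct bp   { struct attr attr; int hw_info; };

/* Stand-in for hw_breakpoint_parse(): validate into scratch state only. */
static int parse(const struct attr *a, int *hw)
{
	if (a->len == 0)
		return -1;		/* invalid request, reject early */
	*hw = (int)(a->addr & 0xff);	/* pretend arch decoding */
	return 0;
}

static int modify_check(struct bp *bp, const struct attr *a)
{
	int hw;

	if (parse(a, &hw))
		return -1;	/* bp->attr and bp->hw_info are untouched */
	bp->attr = *a;		/* commit only after validation succeeded */
	bp->hw_info = hw;
	return 0;
}

int main(void)
{
	struct bp bp = { .attr = { .addr = 0x1000, .type = 1, .len = 4 } };
	struct attr bad = { .addr = 0x2000, .type = 2, .len = 0 };
	int err = modify_check(&bp, &bad);

	printf("modify: %d, addr still %#lx\n", err, bp.attr.addr);
	return 0;
}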
Original-patch-by: Andy Lutomirski Reported-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chris Zankel Cc: Jiri Olsa Cc: Joel Fernandes Cc: Mark Rutland Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/1529981939-8231-13-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 44 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 71387703b5f1..b3814fce5ecb 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -461,37 +461,43 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); +static void hw_breakpoint_copy_attr(struct perf_event_attr *to, + struct perf_event_attr *from) +{ + to->bp_addr = from->bp_addr; + to->bp_type = from->bp_type; + to->bp_len = from->bp_len; + to->disabled = from->disabled; +} + int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - bool modify = attr->bp_type != old_type; struct arch_hw_breakpoint hw; - int err = 0; + int err; - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; + err = hw_breakpoint_parse(bp, attr, &hw); + if (err) + return err; - if (check && memcmp(&bp->attr, attr, sizeof(*attr))) - return -EINVAL; + if (check) { + struct perf_event_attr old_attr; - err = hw_breakpoint_parse(bp, attr, &hw); - if (!err && modify) - err = modify_bp_slot(bp, old_type, bp->attr.bp_type); + old_attr = bp->attr; + hw_breakpoint_copy_attr(&old_attr, attr); + if (memcmp(&old_attr, attr, sizeof(*attr))) + return -EINVAL; + } - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - return err; + if (bp->attr.bp_type != attr->bp_type) { + err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type); + if (err) + return err; } + hw_breakpoint_copy_attr(&bp->attr, attr); bp->hw.info = hw; - bp->attr.disabled = attr->disabled; return 0; } -- cgit v1.2.3 From 4bc8d55574dd316e43975651b9259c5c18d741fc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Nov 2017 15:13:56 -0800 Subject: rcu: Add debugging info to assertion The WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp()) in rcu_gp_cleanup() triggers (inexplicably, of course) every so often. This commit therefore extracts more information. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 5 ++++- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aa7cade1b9f3..79c7fe978b17 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2084,7 +2084,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); rdp = this_cpu_ptr(rsp->rda); @@ -2294,6 +2295,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + rnp->completedqs = rnp->gpnum; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -3930,6 +3932,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) &rcu_fqs_class[i], fqs[i]); rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; + rnp->completedqs = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 78e051dffc5b..7365ac53fdd9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -87,6 +87,7 @@ struct rcu_node { unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ /* In leaf rcu_node, each bit corresponds to */ @@ -453,6 +454,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fd12039e512..17b67ecf7dff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -260,8 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * ->exp_tasks pointers, respectively, to reference the newly * blocked tasks. */ - if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(rnp->completedqs == rnp->gpnum); + } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != @@ -535,6 +537,8 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); + WARN_ON_ONCE(rnp->completedqs == rnp->gpnum && + (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. 
*/ np = rcu_next_node_entry(t, rnp); @@ -697,7 +701,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) + dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, @@ -841,6 +846,27 @@ void exit_rcu(void) __rcu_read_unlock(); } +/* + * Dump the blocked-tasks state, but limit the list dump to the + * specified number of elements. + */ +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +{ + int i; + struct list_head *lhp; + + lockdep_assert_held(&rnp->lock); + pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); + pr_cont("\t->blkd_tasks"); + i = 0; + list_for_each(lhp, &rnp->blkd_tasks) { + pr_cont(" %p", lhp); + if (++i >= 10) + break; + } + pr_cont("\n"); +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; @@ -949,6 +975,14 @@ void exit_rcu(void) { } +/* + * Dump the guaranteed-empty blocked-tasks state. Trust but verify. + */ +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +{ + WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST -- cgit v1.2.3 From ce11fae8d43fe9a36823fbbfe7c44de775b7e346 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 9 Mar 2018 09:32:18 +0800 Subject: rcu: Use the proper lockdep annotation in dump_blkd_tasks() Sparse reported this: | kernel/rcu/tree_plugin.h:814:9: warning: incorrect type in argument 1 (different modifiers) | kernel/rcu/tree_plugin.h:814:9: expected struct lockdep_map const *lock | kernel/rcu/tree_plugin.h:814:9: got struct lockdep_map [noderef] * This is caused by using vanilla lockdep annotations on rcu_node::lock, and that requires accessing ->lock of rcu_node directly. However we need to keep rcu_node::lock __private to avoid breaking its extra ordering guarantee. And we have a dedicated lockdep annotation for rcu_node::lock, so use it. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 17b67ecf7dff..e387ea712758 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -855,7 +855,7 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int i; struct list_head *lhp; - lockdep_assert_held(&rnp->lock); + raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); pr_cont("\t->blkd_tasks"); i = 0; -- cgit v1.2.3 From 8c42b1f39fdf9fde7cfc4024397255f31a860db6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 9 Apr 2018 11:04:46 -0700 Subject: rcu: Exclude near-simultaneous RCU CPU stall warnings There is a two-jiffy delay between the time that a CPU will self-report an RCU CPU stall warning and the time that some other CPU will report a warning on behalf of the first CPU. This has worked well in the past, but on busy systems, it is possible for the two warnings to overlap, which makes interpreting them extremely difficult. This commit therefore uses a cmpxchg-based timing decision that allows only one report in a given one-minute period (assuming default stall-warning Kconfig parameters). This approach will of course fail if you are seeing minute-long vCPU preemption, but in that case the overlapping RCU CPU stall warnings are the least of your worries. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79c7fe978b17..b1fffa21b9e4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1368,7 +1368,6 @@ static inline void panic_on_rcu_stall(void) static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; - long delta; unsigned long flags; unsigned long gpa; unsigned long j; @@ -1381,18 +1380,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) if (rcu_cpu_stall_suppress) return; - /* Only let one CPU complain about others per time interval. */ - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - delta = jiffies - READ_ONCE(rsp->jiffies_stall); - if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - WRITE_ONCE(rsp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1441,6 +1428,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) sched_show_task(current); } } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) + WRITE_ONCE(rsp->jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); rcu_check_gp_kthread_starvation(rsp); @@ -1485,6 +1476,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1508,6 +1500,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long gpnum; unsigned long gps; unsigned long j; + unsigned long jn; unsigned long js; struct rcu_node *rnp; @@ -1546,14 +1539,17 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; if (rcu_gp_in_progress(rsp) && - (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. 
*/ print_cpu_stall(rsp); } else if (rcu_gp_in_progress(rsp) && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(rsp, gpnum); -- cgit v1.2.3 From 375899cddcbb26881b03cb3fbdcfd600e4e67f4a Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 1 Jun 2018 14:26:42 +0530 Subject: printk: make sure to print log on console. This patch makes sure that a log message is printed on the console if its loglevel, at the time the message was stored, was below the console loglevel in effect at that time. @why In SMP, printk can work asynchronously; messages can be missed on the console because the log level is checked at console_unlock() time, not at the time the messages are stored. func() { .... .... console_verbose(); // user wants to have all the logs on console. pr_alert(); dump_backtrace(); // prints with default loglevel. ... console_silent(); // stop all logs from printing on console. } Now if console_lock was owned by another process, the messages might be handled after the consoles were silenced. Reuse the LOG_NOCONS flag; its previous usage was removed long ago by commit 5c2992ee7fd8a29d0412 ("printk: remove console flushing special cases for partial buffered lines"). Note that there are still some corner cases where this patch is not enough. For example, when the messages are flushed later from printk_safe buffers or when there are races between console_verbose() and console_silent() callers. Link: http://lkml.kernel.org/r/20180601090029epcas5p3cc93d4bfbebb3199f0a2684058da7e26~z-a_jkmrI2993329933epcas5p3q@epcas5p3.samsung.com Cc: linux-kernel@vger.kernel.org Cc: a.sahrawat@samsung.com Cc: pankaj.m@samsung.com Cc: v.narang@samsung.com Cc: Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..3999c295d6f7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -349,7 +349,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1886,6 +1886,9 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; + if (suppress_message_printing(level)) + lflags |= LOG_NOCONS; + printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_unlock_irqrestore(flags); @@ -2349,11 +2352,10 @@ skip: break; msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (msg->flags & LOG_NOCONS) { /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. + * Skip record if !ignore_loglevel, and + * record has level above the console loglevel.
*/ console_idx = log_next(console_idx); console_seq++; -- cgit v1.2.3 From 63842c21347ecf4685d6ec008e667e61b0ead6e5 Mon Sep 17 00:00:00 2001 From: Namit Gupta Date: Wed, 20 Jun 2018 19:26:19 +0530 Subject: printk: Remove unnecessary kmalloc() from syslog during clear When the request is only for clearing logs, there is no need for allocation/deallocation. Only the indexes need to be reset and returned. Rest of the patch is mostly made up of changes because of indention. Link: http://lkml.kernel.org/r/20180620135951epcas5p3bd2a8f25ec689ca333bce861b527dba2~54wyKcT0_3155531555epcas5p3y@epcas5p3.samsung.com Cc: linux-kernel@vger.kernel.org Cc: pankaj.m@samsung.com Cc: a.sahrawat@samsung.com Signed-off-by: Namit Gupta Signed-off-by: Himanshu Maithani Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 109 ++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3999c295d6f7..16b02cc51a14 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1352,71 +1352,78 @@ static int syslog_print_all(char __user *buf, int size, bool clear) { char *text; int len = 0; + u64 next_seq; + u64 seq; + u32 idx; + + if (!buf) { + if (clear) { + logbuf_lock_irq(); + clear_seq = log_next_seq; + clear_idx = log_next_idx; + logbuf_unlock_irq(); + } + return 0; + } text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; logbuf_lock_irq(); - if (buf) { - u64 next_seq; - u64 seq; - u32 idx; + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ + seq = clear_seq; + idx = clear_idx; + while (seq < log_next_seq) { + struct printk_log *msg = log_from_idx(idx); - /* - * Find first record that fits, including all following records, - * into the user-provided buffer for this dump. 
- */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, NULL, 0); - idx = log_next(idx); - seq++; - } + len += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } - /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); + /* move first record forward until length fits into the buffer */ + seq = clear_seq; + idx = clear_idx; + while (len > size && seq < log_next_seq) { + struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); - idx = log_next(idx); - seq++; - } + len -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } - /* last message fitting into this dump */ - next_seq = log_next_seq; + /* last message fitting into this dump */ + next_seq = log_next_seq; - len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen; + len = 0; + while (len >= 0 && seq < next_seq) { + struct printk_log *msg = log_from_idx(idx); + int textlen; - textlen = msg_print_text(msg, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (textlen < 0) { - len = textlen; - break; - } - idx = log_next(idx); - seq++; + textlen = msg_print_text(msg, true, text, + LOG_LINE_MAX + PREFIX_MAX); + if (textlen < 0) { + len = textlen; + break; + } + idx = log_next(idx); + seq++; - logbuf_unlock_irq(); - if (copy_to_user(buf + len, text, textlen)) - len = -EFAULT; - else - len += textlen; - logbuf_lock_irq(); + logbuf_unlock_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; + logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (seq < log_first_seq) { + /* messages are gone, move to next one */ + seq = log_first_seq; + idx = log_first_idx; } } -- cgit v1.2.3 From d48de54a9dab5370edd2e991f78cc7996cf5483e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 28 Jun 2018 15:20:27 +0200 Subject: printk: Export is_console_locked This is a preparation patch for adding a number of WARN_CONSOLE_UNLOCKED() calls to the fbcon code, which may be built as a module (event though usually it is not). Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Acked-by: Petr Mladek Reviewed-by: Daniel Vetter Signed-off-by: Hans de Goede Signed-off-by: Bartlomiej Zolnierkiewicz --- kernel/printk/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..3f041e7cbfc9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2243,6 +2243,7 @@ int is_console_locked(void) { return console_locked; } +EXPORT_SYMBOL(is_console_locked); /* * Check if we have any console that is capable of printing while cpu is -- cgit v1.2.3 From 65a8766f5f50a5cd342f3d8f77a30917648ddb67 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 14 Jun 2018 16:20:05 -0400 Subject: audit: check audit_enabled in audit_tree_log_remove_rule() Respect the audit_enabled flag when printing tree rule config change records. 
See: https://github.com/linux-audit/audit-kernel/issues/50 Signed-off-by: Richard Guy Briggs [PM: tweak the subject line] Signed-off-by: Paul Moore --- kernel/audit_tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c99ebaae5abc..9f6eaeb6919f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -497,6 +497,8 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) { struct audit_buffer *ab; + if (!audit_enabled) + return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; -- cgit v1.2.3 From 4fa7f086993594e79f456a04656da2fcb5691209 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 14 Jun 2018 16:20:06 -0400 Subject: audit: simplify audit_enabled check in audit_watch_log_rule_change() Check the audit_enabled flag and bail immediately. This does not change the functionality, but brings the code format in line with similar checks in audit_tree_log_remove_rule(), audit_mark_log_rule_change(), and elsewhere in the audit code. See: https://github.com/linux-audit/audit-kernel/issues/50 Signed-off-by: Richard Guy Briggs [PM: tweaked subject line] Signed-off-by: Paul Moore --- kernel/audit_watch.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index c17c0c268436..6f249bdf2d84 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -238,20 +238,21 @@ out: static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op) { - if (audit_enabled) { - struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); - if (unlikely(!ab)) - return; - audit_log_format(ab, "auid=%u ses=%u op=%s", - from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current), op); - audit_log_format(ab, " path="); - audit_log_untrustedstring(ab, w->path); - audit_log_key(ab, r->filterkey); - audit_log_format(ab, " list=%d res=1", r->listnr); - audit_log_end(ab); - } + struct audit_buffer *ab; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (!ab) + return; + audit_log_format(ab, "auid=%u ses=%u op=%s", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current), op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, w->path); + audit_log_key(ab, r->filterkey); + audit_log_format(ab, " list=%d res=1", r->listnr); + audit_log_end(ab); } /* Update inode info in audit rules based on filesystem event. */ -- cgit v1.2.3 From 0cc3cd21657be04cb0559fe8063f2130493f92cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Jun 2018 16:05:48 +0200 Subject: cpu/hotplug: Boot HT siblings at least once Due to the way Machine Check Exceptions work on X86 hyperthreads it's required to boot up _all_ logical cores at least once in order to set the CR4.MCE bit. So instead of ignoring the sibling threads right away, let them boot up once so they can configure themselves. After they came out of the initial boot stage check whether its a "secondary" sibling and cancel the operation which puts the CPU back into offline state. 
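As a reading aid, a minimal userspace model of the policy implemented by the hunks below (a hypothetical simplification, not kernel code): a sibling thread may come online exactly once so it can set CR4.MCE, and is rejected on any later attempt while SMT is disabled:

#include <stdbool.h>
#include <stdio.h>

enum smt_control { SMT_ENABLED, SMT_DISABLED };

struct cpu_state {
	bool booted_once;	/* mirrors cpuhp_cpu_state::booted_once */
	bool primary_thread;
};

static enum smt_control smt = SMT_DISABLED;

/* Simplified model of cpu_smt_allowed() after this patch. */
static bool smt_allowed(const struct cpu_state *cpu)
{
	if (smt == SMT_ENABLED)
		return true;
	if (cpu->primary_thread)
		return true;
	/* Let a sibling boot once (so it can set CR4.MCE), never again. */
	return !cpu->booted_once;
}

int main(void)
{
	struct cpu_state sibling = { .booted_once = false, .primary_thread = false };

	printf("first bringup allowed: %d\n", smt_allowed(&sibling));
	sibling.booted_once = true;	/* set in notify_cpu_starting() */
	printf("later bringup allowed: %d\n", smt_allowed(&sibling));
	return 0;
}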
Reported-by: Dave Hansen Signed-off-by: Thomas Gleixner Tested-by: Tony Luck --- kernel/cpu.c | 72 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d29fdd7e57bb..6f3a3cde8b83 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -60,6 +60,7 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; + bool booted_once; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -342,6 +343,40 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_HOTPLUG_SMT +enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_control = CPU_SMT_DISABLED; + if (str && !strcmp(str, "force")) { + pr_info("SMT: Force disabled\n"); + cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } + return 0; +} +early_param("nosmt", smt_cmdline_disable); + +static inline bool cpu_smt_allowed(unsigned int cpu) +{ + if (cpu_smt_control == CPU_SMT_ENABLED) + return true; + + if (topology_is_primary_thread(cpu)) + return true; + + /* + * On x86 it's required to boot all logical CPUs at least once so + * that the init code can get a chance to set CR4.MCE on each + * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * core will shutdown the machine. + */ + return !per_cpu(cpuhp_state, cpu).booted_once; +} +#else +static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +#endif + static inline enum cpuhp_state cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -422,6 +457,16 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); + /* + * SMT soft disabling on X86 requires to bring the CPU out of the + * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The + * CPU marked itself as booted_once in cpu_notify_starting() so the + * cpu_smt_allowed() check will now return false if this is not the + * primary sibling. + */ + if (!cpu_smt_allowed(cpu)) + return -ECANCELED; + if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; @@ -933,29 +978,6 @@ EXPORT_SYMBOL(cpu_down); #define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ -#ifdef CONFIG_HOTPLUG_SMT -enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; - -static int __init smt_cmdline_disable(char *str) -{ - cpu_smt_control = CPU_SMT_DISABLED; - if (str && !strcmp(str, "force")) { - pr_info("SMT: Force disabled\n"); - cpu_smt_control = CPU_SMT_FORCE_DISABLED; - } - return 0; -} -early_param("nosmt", smt_cmdline_disable); - -static inline bool cpu_smt_allowed(unsigned int cpu) -{ - return cpu_smt_control == CPU_SMT_ENABLED || - topology_is_primary_thread(cpu); -} -#else -static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } -#endif - /** * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started @@ -970,6 +992,7 @@ void notify_cpu_starting(unsigned int cpu) int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. 
*/ + st->booted_once = true; while (st->state < target) { st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -2180,5 +2203,6 @@ void __init boot_cpu_init(void) */ void __init boot_cpu_state_init(void) { - per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; + this_cpu_write(cpuhp_state.booted_once, true); + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit v1.2.3 From 6fec64e1c92d5c715c6d0f50786daa7708266bde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 Jun 2018 15:21:31 +0200 Subject: posix-timers: Make forward callback return s64 The posix timer ti_overrun handling is broken because the forwarding functions can return a huge number of overruns which does not fit in an int. As a consequence timer_getoverrun(2) and siginfo::si_overrun can turn into random number generators. As a first step to address that let the timer_forward() callbacks return the full 64 bit value. Cast it to (int) temporarily until k_itimer::ti_overrun is converted to 64bit and the conversion to user space visible values is sanitized. Reported-by: Team OWL337 Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Peter Zijlstra Cc: Michael Kerrisk Link: https://lkml.kernel.org/r/20180626132704.922098090@linutronix.de --- kernel/time/alarmtimer.c | 4 ++-- kernel/time/posix-timers.c | 6 +++--- kernel/time/posix-timers.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 639321bf2e39..78a3cc555823 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -581,11 +581,11 @@ static void alarm_timer_rearm(struct k_itimer *timr) * @timr: Pointer to the posixtimer data struct * @now: Current time to forward the timer against */ -static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) +static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return (int) alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, timr->it_interval, now); } /** diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 80d59333c76e..db1d65963a57 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -645,11 +645,11 @@ static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) return __hrtimer_expires_remaining_adjusted(timer, now); } -static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; - return (int)hrtimer_forward(timer, now, timr->it_interval); + return hrtimer_forward(timer, now, timr->it_interval); } /* @@ -702,7 +702,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * expiry time forward by intervals, so expiry is > now. 
*/ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) - timr->it_overrun += kc->timer_forward(timr, now); + timr->it_overrun += (int)kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); /* Return 0 only, when the timer is expired and not pending */ diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 151e28f5bf30..ddb21145211a 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -19,7 +19,7 @@ struct k_clock { void (*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); void (*timer_rearm)(struct k_itimer *timr); - int (*timer_forward)(struct k_itimer *timr, ktime_t now); + s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); int (*timer_try_to_cancel)(struct k_itimer *timr); void (*timer_arm)(struct k_itimer *timr, ktime_t expires, -- cgit v1.2.3 From 78c9c4dfbf8c04883941445a195276bb4bb92c76 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 Jun 2018 15:21:32 +0200 Subject: posix-timers: Sanitize overrun handling The posix timer overrun handling is broken because the forwarding functions can return a huge number of overruns which does not fit in an int. As a consequence timer_getoverrun(2) and siginfo::si_overrun can turn into random number generators. The k_clock::timer_forward() callbacks return a 64 bit value now. Make k_itimer::ti_overrun[_last] 64bit as well, so the kernel internal accounting is correct. 3Remove the temporary (int) casts. Add a helper function which clamps the overrun value returned to user space via timer_getoverrun(2) or siginfo::si_overrun limited to a positive value between 0 and INT_MAX. INT_MAX is an indicator for user space that the overrun value has been clamped. Reported-by: Team OWL337 Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Peter Zijlstra Cc: Michael Kerrisk Link: https://lkml.kernel.org/r/20180626132705.018623573@linutronix.de --- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5a6251ac6f7a..562cc3891b57 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -85,7 +85,7 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) continue; timer->it.cpu.expires += incr; - timer->it_overrun += 1 << i; + timer->it_overrun += 1LL << i; delta -= incr; } } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index db1d65963a57..3ac7295306dc 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -283,6 +283,17 @@ static __init int init_posix_timers(void) } __initcall(init_posix_timers); +/* + * The siginfo si_overrun field and the return value of timer_getoverrun(2) + * are of type int. Clamp the overrun value to INT_MAX + */ +static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +{ + s64 sum = timr->it_overrun_last + (s64)baseval; + + return sum > (s64)INT_MAX ? 
INT_MAX : (int)sum; +} + static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; @@ -290,9 +301,8 @@ static void common_hrtimer_rearm(struct k_itimer *timr) if (!timr->it_interval) return; - timr->it_overrun += (unsigned int) hrtimer_forward(timer, - timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), + timr->it_interval); hrtimer_restart(timer); } @@ -321,10 +331,10 @@ void posixtimer_rearm(struct siginfo *info) timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; + timr->it_overrun = -1LL; ++timr->it_requeue_pending; - info->si_overrun += timr->it_overrun_last; + info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } unlock_timer(timr, flags); @@ -418,9 +428,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) now = ktime_add(now, kj); } #endif - timr->it_overrun += (unsigned int) - hrtimer_forward(timer, now, - timr->it_interval); + timr->it_overrun += hrtimer_forward(timer, now, + timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; timr->it_active = 1; @@ -524,7 +533,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; - new_timer->it_overrun = -1; + new_timer->it_overrun = -1LL; if (event) { rcu_read_lock(); @@ -702,7 +711,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * expiry time forward by intervals, so expiry is > now. */ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) - timr->it_overrun += (int)kc->timer_forward(timr, now); + timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); /* Return 0 only, when the timer is expired and not pending */ @@ -791,7 +800,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) if (!timr) return -EINVAL; - overrun = timr->it_overrun_last; + overrun = timer_overrun_to_int(timr, 0); unlock_timer(timr, flags); return overrun; -- cgit v1.2.3 From 5f936e19cc0ef97dbe3a56e9498922ad5ba1edef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Jul 2018 09:34:29 +0200 Subject: alarmtimer: Prevent overflow for relative nanosleep Air Icy reported: UBSAN: Undefined behaviour in kernel/time/alarmtimer.c:811:7 signed integer overflow: 1529859276030040771 + 9223372036854775807 cannot be represented in type 'long long int' Call Trace: alarm_timer_nsleep+0x44c/0x510 kernel/time/alarmtimer.c:811 __do_sys_clock_nanosleep kernel/time/posix-timers.c:1235 [inline] __se_sys_clock_nanosleep kernel/time/posix-timers.c:1213 [inline] __x64_sys_clock_nanosleep+0x326/0x4e0 kernel/time/posix-timers.c:1213 do_syscall_64+0xb8/0x3a0 arch/x86/entry/common.c:290 alarm_timer_nsleep() uses ktime_add() to add the current time and the relative expiry value. ktime_add() has no sanity checks so the addition can overflow when the relative timeout is large enough. Use ktime_add_safe() which has the necessary sanity checks in place and limits the result to the valid range. 
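For reference, a small userspace sketch of the failure mode and the fix (assumptions: KTIME_MAX stands in for the kernel's limit, and add_safe() only mimics the spirit of ktime_add_safe(); this is not the kernel implementation):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define KTIME_MAX INT64_MAX	/* stands in for the kernel's KTIME_MAX */

/* Saturating add in the spirit of ktime_add_safe(): clamp instead of wrapping. */
static int64_t add_safe(int64_t now, int64_t delta)
{
	/* assumes now >= 0, as a clock reading is here */
	if (delta > KTIME_MAX - now)
		return KTIME_MAX;
	return now + delta;
}

int main(void)
{
	int64_t now = 1529859276030040771LL;	/* value from the UBSAN report above */
	int64_t rel = INT64_MAX;		/* huge relative timeout from user space */

	/* "now + rel" would be signed overflow (undefined behaviour); clamp instead. */
	printf("%" PRId64 "\n", add_safe(now, rel));
	return 0;
}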
Fixes: 9a7adcf5c6de ("timers: Posix interface for alarm-timers") Reported-by: Team OWL337 Signed-off-by: Thomas Gleixner Cc: John Stultz Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1807020926360.1595@nanos.tec.linutronix.de --- kernel/time/alarmtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 78a3cc555823..fa5de5e8de61 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -808,7 +808,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); - exp = ktime_add(now, exp); + + exp = ktime_add_safe(now, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); -- cgit v1.2.3 From d5641c64c48f4408500a348301bff01fbf2c1ec5 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 25 Jun 2018 13:30:58 +0800 Subject: PM / hibernate: cast PAGE_SIZE to int when comparing with error code If PAGE_SIZE is unsigned type then negative error code will be larger than PAGE_SIZE. Signed-off-by: Chengguang Xu Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index c2bcf97d24c8..d7f6c1a288d3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -923,7 +923,7 @@ int swsusp_write(unsigned int flags) } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); - if (error < PAGE_SIZE) { + if (error < (int)PAGE_SIZE) { if (error >= 0) error = -EFAULT; @@ -1483,7 +1483,7 @@ int swsusp_read(unsigned int *flags_p) memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot); - if (error < PAGE_SIZE) + if (error < (int)PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); error = get_swap_reader(&handle, flags_p); -- cgit v1.2.3 From ea272257ccba1e57b9448444e1e3de39794ad6b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 26 Jun 2018 06:49:11 -0300 Subject: docs: histogram.txt: convert it to ReST file format Despite being mentioned at Documentation/trace/ftrace.rst as a rst file, this file was still a text one, with several issues. Convert it to ReST and add it to the trace index: - Mark the document title as such; - Identify and indent the literal blocks; - Use the proper markups for table. Signed-off-by: Mauro Carvalho Chehab Acked-by: Steven Rostedt (VMware) Signed-off-by: Jonathan Corbet --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dcc0166d1997..2bd4a9181a0f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -605,7 +605,7 @@ config HIST_TRIGGERS Inter-event tracing of quantities such as latencies is also supported using hist triggers under this option. - See Documentation/trace/histogram.txt. + See Documentation/trace/histogram.rst. If in doubt, say N. config MMIOTRACE_TEST -- cgit v1.2.3 From 9cf57731b63e37ed995b46690adc604891a9a28f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 10:52:03 +0200 Subject: watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work Oleg suggested to replace the "watchdog/%u" threads with cpu_stop_work. That removes one thread per CPU while at the same time fixes softlockup vs SCHED_DEADLINE. 
But more importantly, it does away with the single smpboot_update_cpumask_percpu_thread() user, which allows cleanups/shrinkage of the smpboot interface. Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 ++ kernel/watchdog.c | 137 ++++++++++++++++++++++++------------------------------ 2 files changed, 65 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..191097c45fb1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_AP_WATCHDOG_ONLINE] = { + .name = "lockup_detector:online", + .startup.single = lockup_detector_online_cpu, + .teardown.single = lockup_detector_offline_cpu, + }, [CPUHP_AP_WORKQUEUE_ONLINE] = { .name = "workqueue:online", .startup.single = workqueue_online_cpu, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 576d18045811..b81f777838d5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -18,18 +18,14 @@ #include #include #include -#include -#include -#include #include -#include #include #include #include +#include #include #include -#include static DEFINE_MUTEX(watchdog_mutex); @@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void) unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -static bool softlockup_threads_initialized __read_mostly; +static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); @@ -335,6 +330,25 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } +/* + * The watchdog thread function - touches the timestamp. + * + * It only runs once every sample_period seconds (4 seconds by + * default) to reset the softlockup timestamp. If this gets delayed + * for more than 2*watchdog_thresh seconds then the debug-printout + * triggers in watchdog_timer_fn(). + */ +static int softlockup_fn(void *data) +{ + __this_cpu_write(soft_lockup_hrtimer_cnt, + __this_cpu_read(hrtimer_interrupts)); + __touch_watchdog(); + + return 0; +} + +static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -350,7 +364,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - wake_up_process(__this_cpu_read(softlockup_watchdog)); + stop_one_cpu_nowait(smp_processor_id(), + softlockup_fn, NULL, + this_cpu_ptr(&softlockup_stop_work)); /* .. 
and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -448,17 +464,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void watchdog_set_prio(unsigned int policy, unsigned int prio) -{ - struct sched_param param = { .sched_priority = prio }; - - sched_setscheduler(current, policy, ¶m); -} - static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + WARN_ON_ONCE(cpu != smp_processor_id()); + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -473,15 +484,14 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ if (watchdog_enabled & NMI_WATCHDOG_ENABLED) watchdog_nmi_enable(cpu); - - watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - watchdog_set_prio(SCHED_NORMAL, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + /* * Disable the perf event first. That prevents that a large delay * between disabling the timer and disabling the perf event causes @@ -491,77 +501,63 @@ static void watchdog_disable(unsigned int cpu) hrtimer_cancel(hrtimer); } -static void watchdog_cleanup(unsigned int cpu, bool online) +static int softlockup_stop_fn(void *data) { - watchdog_disable(cpu); + watchdog_disable(smp_processor_id()); + return 0; } -static int watchdog_should_run(unsigned int cpu) +static void softlockup_stop_all(void) { - return __this_cpu_read(hrtimer_interrupts) != - __this_cpu_read(soft_lockup_hrtimer_cnt); + int cpu; + + if (!softlockup_initialized) + return; + + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); + + cpumask_clear(&watchdog_allowed_mask); } -/* - * The watchdog thread function - touches the timestamp. - * - * It only runs once every sample_period seconds (4 seconds by - * default) to reset the softlockup timestamp. If this gets delayed - * for more than 2*watchdog_thresh seconds then the debug-printout - * triggers in watchdog_timer_fn(). 
- */ -static void watchdog(unsigned int cpu) +static int softlockup_start_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); - __touch_watchdog(); + watchdog_enable(smp_processor_id()); + return 0; } -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .cleanup = watchdog_cleanup, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - -static void softlockup_update_smpboot_threads(void) +static void softlockup_start_all(void) { - lockdep_assert_held(&watchdog_mutex); - - if (!softlockup_threads_initialized) - return; + int cpu; - smpboot_update_cpumask_percpu_thread(&watchdog_threads, - &watchdog_allowed_mask); + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + for_each_cpu(cpu, &watchdog_allowed_mask) + smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); } -/* Temporarily park all watchdog threads */ -static void softlockup_park_all_threads(void) +int lockup_detector_online_cpu(unsigned int cpu) { - cpumask_clear(&watchdog_allowed_mask); - softlockup_update_smpboot_threads(); + watchdog_enable(cpu); + return 0; } -/* Unpark enabled threads */ -static void softlockup_unpark_threads(void) +int lockup_detector_offline_cpu(unsigned int cpu) { - cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); - softlockup_update_smpboot_threads(); + watchdog_disable(cpu); + return 0; } static void lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); - softlockup_park_all_threads(); + + softlockup_stop_all(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) - softlockup_unpark_threads(); + softlockup_start_all(); + watchdog_nmi_start(); cpus_read_unlock(); /* @@ -580,8 +576,6 @@ static void lockup_detector_reconfigure(void) */ static __init void lockup_detector_setup(void) { - int ret; - /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. @@ -592,24 +586,13 @@ static __init void lockup_detector_setup(void) !(watchdog_enabled && watchdog_thresh)) return; - ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_allowed_mask); - if (ret) { - pr_err("Failed to initialize soft lockup detector threads\n"); - return; - } - mutex_lock(&watchdog_mutex); - softlockup_threads_initialized = true; lockup_detector_reconfigure(); + softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static inline int watchdog_park_threads(void) { return 0; } -static inline void watchdog_unpark_threads(void) { } -static inline int watchdog_enable_all_cpus(void) { return 0; } -static inline void watchdog_disable_all_cpus(void) { } static void lockup_detector_reconfigure(void) { cpus_read_lock(); -- cgit v1.2.3 From 167a88677b05d6a810f23b871cfb2b5db1808e60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 10:53:01 +0200 Subject: smpboot: Remove cpumask from the API Now that the sole use of the whole smpboot_*cpumask() API is gone, remove it. 
Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/smpboot.c | 54 +++++------------------------------------------------- 1 file changed, 5 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 5043e7433f4b..c230c2dd48e1 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu) mutex_lock(&smpboot_threads_lock); list_for_each_entry(cur, &hotplug_threads, list) - if (cpumask_test_cpu(cpu, cur->cpumask)) - smpboot_unpark_thread(cur, cpu); + smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); return 0; } @@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * smpboot_register_percpu_thread - Register a per_cpu thread related * to hotplug * @plug_thread: Hotplug thread descriptor - * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, - const struct cpumask *cpumask) +int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) { unsigned int cpu; int ret = 0; - if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) - return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpumask); - get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); - free_cpumask_var(plug_thread->cpumask); goto out; } - if (cpumask_test_cpu(cpu, cpumask)) - smpboot_unpark_thread(plug_thread, cpu); + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -315,7 +306,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug @@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) smpboot_destroy_threads(plug_thread); mutex_unlock(&smpboot_threads_lock); put_online_cpus(); - free_cpumask_var(plug_thread->cpumask); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); -/** - * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked - * @plug_thread: Hotplug thread descriptor - * @new: Revised mask to use - * - * The cpumask field in the smp_hotplug_thread must not be updated directly - * by the client, but only by calling this function. - * This function can only be called on a registered smp_hotplug_thread. - */ -void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) -{ - struct cpumask *old = plug_thread->cpumask; - static struct cpumask tmp; - unsigned int cpu; - - lockdep_assert_cpus_held(); - mutex_lock(&smpboot_threads_lock); - - /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(&tmp, old, new); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_park_thread(plug_thread, cpu); - - /* Unpark threads that are exclusively enabled on the new mask. 
*/ - cpumask_andnot(&tmp, new, old); - for_each_cpu_and(cpu, &tmp, cpu_online_mask) - smpboot_unpark_thread(plug_thread, cpu); - - cpumask_copy(old, new); - - mutex_unlock(&smpboot_threads_lock); -} - static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); /* -- cgit v1.2.3 From f83ee19be4272564ad592ef90145db7295229490 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 10:55:56 +0200 Subject: kthread: Simplify kthread_park() completion Oleg explains the reason we could hit park+park is that smpboot_update_cpumask_percpu_thread()'s for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_kthread(); turns into: for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) smpboot_park_kthread(); on UP, ignoring the mask. But since we just completely removed that function, this is no longer relevant. So revert commit: b1f5b378e126 ("kthread: Allow kthread_park() on a parked kthread") Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/kthread.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 750cb8082694..11b591ee51ab 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self) if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; - complete_all(&self->parked); + complete(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -465,7 +465,6 @@ void kthread_unpark(struct task_struct *k) if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); - reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. @@ -493,6 +492,9 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; + if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) + return -EBUSY; + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); -- cgit v1.2.3 From 55f036ca7e74b85e34958af3d22121c656796413 Mon Sep 17 00:00:00 2001 From: Peter Ziljstra Date: Fri, 15 Jun 2018 10:07:12 +0200 Subject: locking: WW mutex cleanup Make the WW mutex code more readable by adding comments, splitting up functions and pointing out that we're actually using the Wait-Die algorithm. Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Gustavo Padovan Cc: Maarten Lankhorst Cc: Sean Paul Cc: David Airlie Cc: Davidlohr Bueso Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Greg Kroah-Hartman Cc: linux-doc@vger.kernel.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Co-authored-by: Thomas Hellstrom Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Hellstrom Acked-by: Ingo Molnar --- kernel/locking/mutex.c | 202 ++++++++++++++++++++++++++++++------------------- 1 file changed, 125 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index f44f658ae629..cfe48419b7d0 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -244,6 +244,17 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { @@ -282,26 +293,53 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); #endif ww_ctx->acquired++; + ww->ctx = ww_ctx; } +/* + * Determine if context @a is 'after' context @b. IOW, @a is a younger + * transaction than @b and depending on algorithm either needs to wait for + * @b or die. + */ static inline bool __sched __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) { - return a->stamp - b->stamp <= LONG_MAX && - (a->stamp != b->stamp || a > b); + + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a younger waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool __sched +__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (waiter->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, waiter); + wake_up_process(waiter->task); + } + + return true; } /* - * Wake up any waiters that may have to back off when the lock is held by the - * given context. + * We just acquired @lock under @ww_ctx, if there are later contexts waiting + * behind us on the wait-list, check if they need to die. * - * Due to the invariants on the wait list, this can only affect the first - * waiter with a context. + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. * * The current task must not be on the wait list. 
*/ static void __sched -__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { struct mutex_waiter *cur; @@ -311,30 +349,23 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (cur->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - - break; + if (__ww_mutex_die(lock, cur, ww_ctx)) + break; } } /* - * After acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; - /* * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be + * the WAITERS check is done, otherwise contended waiters might be * missed. The contended waiters will either see ww_ctx == NULL * and keep spinning, or it will acquire wait_lock, add itself * to waiter list and sleep. @@ -348,29 +379,14 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return; /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die. */ spin_lock(&lock->base.wait_lock); - __ww_mutex_wakeup_for_backoff(&lock->base, ctx); + __ww_mutex_check_waiters(&lock->base, ctx); spin_unlock(&lock->base.wait_lock); } -/* - * After acquiring lock in the slowpath set ctx. - * - * Unlike for the fast path, the caller ensures that waiters are woken up where - * necessary. - * - * Callers must hold the mutex wait_lock. - */ -static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; -} - #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline @@ -646,37 +662,73 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) } EXPORT_SYMBOL(ww_mutex_unlock); + +static __always_inline int __sched +__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + + +/* + * Check whether we need to kill the transaction for the current lock acquire. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. 
+ */ static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) +__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); struct mutex_waiter *cur; + if (ctx->acquired == 0) + return 0; + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - goto deadlock; + return __ww_mutex_kill(lock, ctx); /* * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must back off. + * stamp is earlier than ours and we must kill ourself. */ cur = waiter; list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (cur->ww_ctx) - goto deadlock; + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); } return 0; - -deadlock: -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; } +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting. + */ static inline int __sched __ww_mutex_add_waiter(struct mutex_waiter *waiter, struct mutex *lock, @@ -693,7 +745,7 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, /* * Add the waiter before the first waiter with a higher stamp. * Waiters without a context are skipped to avoid starving - * them. + * them. Wait-Die waiters may die here. */ pos = &lock->wait_list; list_for_each_entry_reverse(cur, &lock->wait_list, list) { @@ -701,34 +753,27 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, continue; if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* Back off immediately if necessary. */ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + int ret = __ww_mutex_kill(lock, ww_ctx); - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; - } + if (ret) + return ret; break; } pos = &cur->list; - /* - * Wake up the waiter so that it gets a chance to back - * off. - */ - if (cur->ww_ctx->acquired > 0) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); } list_add_tail(&waiter->list, pos); + return 0; } @@ -772,7 +817,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ if (__mutex_trylock(lock)) { if (use_ww_ctx && ww_ctx) - __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx); goto skip_wait; } @@ -790,10 +835,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, waiter.ww_ctx = MUTEX_POISON_WW_CTX; #endif } else { - /* Add in stamp order, waking up waiters that must back off. */ + /* + * Add in stamp order, waking up waiters that must kill + * themselves. 
+ */ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) - goto err_early_backoff; + goto err_early_kill; waiter.ww_ctx = ww_ctx; } @@ -815,7 +863,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto acquired; /* - * Check for signals and wound conditions while holding + * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ @@ -824,8 +872,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); + if (use_ww_ctx && ww_ctx) { + ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; } @@ -870,7 +918,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); if (use_ww_ctx && ww_ctx) - ww_mutex_set_context_slowpath(ww, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); spin_unlock(&lock->wait_lock); preempt_enable(); @@ -879,7 +927,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); -err_early_backoff: +err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); -- cgit v1.2.3 From 08295b3b5beec9aac0f7a9db86f0fc3792039da3 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 15 Jun 2018 10:17:38 +0200 Subject: locking: Implement an algorithm choice for Wound-Wait mutexes The current Wound-Wait mutex algorithm is actually not Wound-Wait but Wait-Die. Implement also Wound-Wait as a per-ww-class choice. Wound-Wait is, contrary to Wait-Die a preemptive algorithm and is known to generate fewer backoffs. Testing reveals that this is true if the number of simultaneous contending transactions is small. As the number of simultaneous contending threads increases, Wait-Wound becomes inferior to Wait-Die in terms of elapsed time. Possibly due to the larger number of held locks of sleeping transactions. Update documentation and callers. Timings using git://people.freedesktop.org/~thomash/ww_mutex_test tag patch-18-06-15 Each thread runs 100000 batches of lock / unlock 800 ww mutexes randomly chosen out of 100000. Four core Intel x86_64: Algorithm #threads Rollbacks time Wound-Wait 4 ~100 ~17s. Wait-Die 4 ~150000 ~19s. Wound-Wait 16 ~360000 ~109s. Wait-Die 16 ~450000 ~82s. Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Gustavo Padovan Cc: Maarten Lankhorst Cc: Sean Paul Cc: David Airlie Cc: Davidlohr Bueso Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Kate Stewart Cc: Philippe Ombredanne Cc: Greg Kroah-Hartman Cc: linux-doc@vger.kernel.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Co-authored-by: Peter Zijlstra Signed-off-by: Thomas Hellstrom Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar --- kernel/locking/locktorture.c | 2 +- kernel/locking/mutex.c | 165 ++++++++++++++++++++++++++++++++++++----- kernel/locking/test-ww_mutex.c | 2 +- 3 files changed, 150 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8402b3349dca..c28224347d69 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -365,7 +365,7 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include -static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WD_CLASS(torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cfe48419b7d0..1a81a1257b3f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -173,6 +173,21 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; } +/* + * Add @waiter to a given location in the lock wait_list and set the + * FLAG_WAITERS flag if it's the first waiter. + */ +static void __sched +__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct list_head *list) +{ + debug_mutex_add_waiter(lock, waiter, current); + + list_add_tail(&waiter->list, list); + if (__mutex_waiter_is_first(lock, waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); +} + /* * Give up ownership to a specific task, when @task = NULL, this is equivalent * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves @@ -249,6 +264,11 @@ EXPORT_SYMBOL(mutex_lock); * The newer transactions are killed when: * It (the new transaction) makes a request for a lock being held * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. */ /* @@ -320,6 +340,9 @@ static bool __sched __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, struct ww_acquire_ctx *ww_ctx) { + if (!ww_ctx->is_wait_die) + return false; + if (waiter->ww_ctx->acquired > 0 && __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { debug_mutex_wake_waiter(lock, waiter); @@ -329,13 +352,65 @@ __ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, return true; } +/* + * Wound-Wait; wound a younger @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with older transactions than + * the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __mutex_owner(lock); + + lockdep_assert_held(&lock->wait_lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). 
+ */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_lock_check_stamp() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; +} + /* * We just acquired @lock under @ww_ctx, if there are later contexts waiting - * behind us on the wait-list, check if they need to die. + * behind us on the wait-list, check if they need to die, or wound us. * * See __ww_mutex_add_waiter() for the list-order construction; basically the * list is ordered by stamp, smallest (oldest) first. * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. + * * The current task must not be on the wait list. */ static void __sched @@ -349,7 +424,8 @@ __ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (__ww_mutex_die(lock, cur, ww_ctx)) + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) break; } } @@ -370,17 +446,23 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * and keep spinning, or it will acquire wait_lock, add itself * to waiter list and sleep. */ - smp_mb(); /* ^^^ */ + smp_mb(); /* See comments above and below. */ /* - * Check if lock is contended, if not there is nobody to wake up + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. */ if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* * Uh oh, we raced in fastpath, check if any of the waiters need to - * die. + * die or wound us. */ spin_lock(&lock->base.wait_lock); __ww_mutex_check_waiters(&lock->base, ctx); @@ -682,7 +764,9 @@ __ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) /* - * Check whether we need to kill the transaction for the current lock acquire. + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. * * Wait-Die: If we're trying to acquire a lock already held by an older * context, kill ourselves. @@ -701,6 +785,13 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, if (ctx->acquired == 0) return 0; + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) return __ww_mutex_kill(lock, ctx); @@ -727,7 +818,8 @@ __ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, * Waiters without context are interspersed in FIFO order. * * Furthermore, for Wait-Die kill ourself immediately when possible (there are - * older contexts already waiting) to avoid unnecessary waiting. + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. 
*/ static inline int __sched __ww_mutex_add_waiter(struct mutex_waiter *waiter, @@ -736,16 +828,21 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, { struct mutex_waiter *cur; struct list_head *pos; + bool is_wait_die; if (!ww_ctx) { - list_add_tail(&waiter->list, &lock->wait_list); + __mutex_add_waiter(lock, waiter, &lock->wait_list); return 0; } + is_wait_die = ww_ctx->is_wait_die; + /* * Add the waiter before the first waiter with a higher stamp. * Waiters without a context are skipped to avoid starving - * them. Wait-Die waiters may die here. + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. */ pos = &lock->wait_list; list_for_each_entry_reverse(cur, &lock->wait_list, list) { @@ -758,10 +855,12 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, * is no point in queueing behind it, as we'd have to * die the moment it would acquire the lock. */ - int ret = __ww_mutex_kill(lock, ww_ctx); + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); - if (ret) - return ret; + if (ret) + return ret; + } break; } @@ -772,7 +871,23 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, __ww_mutex_die(lock, cur, ww_ctx); } - list_add_tail(&waiter->list, pos); + __mutex_add_waiter(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + + /* + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. + */ + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); + } return 0; } @@ -796,6 +911,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; } preempt_disable(); @@ -829,7 +952,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, &lock->wait_list); + #ifdef CONFIG_DEBUG_MUTEXES waiter.ww_ctx = MUTEX_POISON_WW_CTX; @@ -848,9 +972,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, waiter.task = current; - if (__mutex_waiter_is_first(lock, &waiter)) - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - set_current_state(state); for (;;) { /* @@ -907,6 +1028,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, acquired: __set_current_state(TASK_RUNNING); + if (use_ww_ctx && ww_ctx) { + /* + * Wound-Wait; we stole the lock (!first_waiter), check the + * waiters as anyone might want to wound us. 
+ */ + if (!ww_ctx->is_wait_die && + !__mutex_waiter_is_first(lock, &waiter)) + __ww_mutex_check_waiters(lock, ww_ctx); + } + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0e4cd64ad2c0..5b915b370d5a 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -26,7 +26,7 @@ #include #include -static DEFINE_WW_CLASS(ww_class); +static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; struct test_mutex { -- cgit v1.2.3 From c72051d5778a9c0e5df31d6553a6fa3507b3685c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:58:24 +0200 Subject: audit: use ktime_get_coarse_ts64() for time access The API got renamed for consistency with the other time accessors, this changes the audit caller as well. Signed-off-by: Arnd Bergmann Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0f3222e4edde..e17bc697d11c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1721,7 +1721,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = current_kernel_time64(); + ktime_get_coarse_ts64(t); *serial = audit_serial(); } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d762e0b8160e..f6a0cb32d76e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1540,10 +1540,10 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->argv[2] = a3; context->argv[3] = a4; context->serial = 0; - context->ctime = current_kernel_time64(); context->in_syscall = 1; context->current_state = state; context->ppid = 0; + ktime_get_coarse_ts64(&context->ctime); } /** -- cgit v1.2.3 From 26acfb666a473d960f0fd971fe68f3e3ad16c70b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Jun 2018 11:29:53 -0400 Subject: x86/KVM: Warn user if KVM is loaded SMT and L1TF CPU bug being present If the L1TF CPU bug is present we allow the KVM module to be loaded as the major of users that use Linux and KVM have trusted guests and do not want a broken setup. Cloud vendors are the ones that are uncomfortable with CVE 2018-3620 and as such they are the ones that should set nosmt to one. Setting 'nosmt' means that the system administrator also needs to disable SMT (Hyper-threading) in the BIOS, or via the 'nosmt' command line parameter, or via the /sys/devices/system/cpu/smt/control. See commit 05736e4ac13c ("cpu/hotplug: Provide knobs to control SMT"). Other mitigations are to use task affinity, cpu sets, interrupt binding, etc - anything to make sure that _only_ the same guests vCPUs are running on sibling threads. 
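As an illustration (not part of this patch): a minimal userspace sketch that reads the SMT control knob mentioned above. It assumes a kernel built with CONFIG_HOTPLUG_SMT, where the file typically reports "on", "off", "forceoff" or "notsupported".

#include <stdio.h>

int main(void)
{
	char state[32];
	FILE *f = fopen("/sys/devices/system/cpu/smt/control", "r");

	if (!f) {
		/* Most likely a kernel without CONFIG_HOTPLUG_SMT */
		perror("smt/control");
		return 1;
	}
	if (fgets(state, sizeof(state), f))
		printf("SMT control: %s", state);
	fclose(f);
	return 0;
}
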
Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6f3a3cde8b83..5a00ebdf98c6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -345,6 +345,7 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +EXPORT_SYMBOL_GPL(cpu_smt_control); static int __init smt_cmdline_disable(char *str) { -- cgit v1.2.3 From 215af5499d9e2b55f111d2431ea20218115f29b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 7 Jul 2018 11:40:18 +0200 Subject: cpu/hotplug: Online siblings when SMT control is turned on Writing 'off' to /sys/devices/system/cpu/smt/control offlines all SMT siblings. Writing 'on' merily enables the abilify to online them, but does not online them automatically. Make 'on' more useful by onlining all offline siblings. Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5a00ebdf98c6..d79e24df2420 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1979,6 +1979,15 @@ static void cpuhp_offline_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2011,11 +2020,24 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static void cpuhp_smt_enable(void) +static int cpuhp_smt_enable(void) { + int cpu, ret = 0; + cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } cpu_maps_update_done(); + return ret; } static ssize_t @@ -2046,7 +2068,7 @@ store_smt_control(struct device *dev, struct device_attribute *attr, if (ctrlval != cpu_smt_control) { switch (ctrlval) { case CPU_SMT_ENABLED: - cpuhp_smt_enable(); + ret = cpuhp_smt_enable(); break; case CPU_SMT_DISABLED: case CPU_SMT_FORCE_DISABLED: -- cgit v1.2.3 From 8599dc7dec874b137384714bfe3ddbbf5d4a614e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 17:06:41 +0200 Subject: printk: Clean up syslog_print_all() syslog_print_all() is called twice. Once with a valid buffer and once just to set the indexes. Both variants are already handled separately. This patch just makes it more obvious. It does not change the existing behavior. 
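As an illustration (not part of this patch): a minimal userspace sketch of the request that now ends up in the new syslog_clear() helper instead of syslog_print_all(NULL, 0, true). The action value 5 mirrors SYSLOG_ACTION_CLEAR as handled in do_syslog(); clearing the ring buffer typically requires CAP_SYSLOG.

#include <stdio.h>
#include <sys/klog.h>

#define SYSLOG_ACTION_CLEAR	5	/* same action value do_syslog() switches on */

int main(void)
{
	if (klogctl(SYSLOG_ACTION_CLEAR, NULL, 0) < 0) {
		perror("klogctl(SYSLOG_ACTION_CLEAR)");
		return 1;
	}
	return 0;
}
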
Link: http://lkml.kernel.org/r/20180627150641.p56xyy6mdzvnfpig@pathway.suse.cz Cc: Sergey Senozhatsky Cc: Namit Gupta Cc: linux-kernel@vger.kernel.org Cc: pankaj.m@samsung.com Cc: a.sahrawat@samsung.com Cc: himanshu.m@samsung.com Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 16b02cc51a14..fcc1992c040a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1356,16 +1356,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 seq; u32 idx; - if (!buf) { - if (clear) { - logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; - logbuf_unlock_irq(); - } - return 0; - } - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -1437,6 +1427,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return len; } +static void syslog_clear(void) +{ + logbuf_lock_irq(); + clear_seq = log_next_seq; + clear_idx = log_next_idx; + logbuf_unlock_irq(); +} + int do_syslog(int type, char __user *buf, int len, int source) { bool clear = false; @@ -1481,7 +1479,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: - syslog_print_all(NULL, 0, true); + syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: -- cgit v1.2.3 From ba552399954dde1b388f7749fecad5c349216981 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 16:08:15 +0200 Subject: printk: Split the code for storing a message into the log buffer It is just a preparation step. The patch does not change the existing behavior. Link: http://lkml.kernel.org/r/20180627140817.27764-2-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..a844c611b17c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1824,28 +1824,16 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } -asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, va_list args) +/* Must be called under logbuf_lock. */ +int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; enum log_flags lflags = 0; - unsigned long flags; - int printed_len; - bool in_sched = false; - - if (level == LOGLEVEL_SCHED) { - level = LOGLEVEL_DEFAULT; - in_sched = true; - } - - boot_delay_msec(level); - printk_delay(); - /* This stops the holder of console_sem just where we want him */ - logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. 
@@ -1886,8 +1874,29 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); + return log_output(facility, level, lflags, + dict, dictlen, text, text_len); +} +asmlinkage int vprintk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) +{ + int printed_len; + bool in_sched = false; + unsigned long flags; + + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; + in_sched = true; + } + + boot_delay_msec(level); + printk_delay(); + + /* This stops the holder of console_sem just where we want him */ + logbuf_lock_irqsave(flags); + printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ -- cgit v1.2.3 From a338f84dc196f44b63ba0863d2f34fd9b1613572 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 16:08:16 +0200 Subject: printk: Create helper function to queue deferred console handling It is just a preparation step. The patch does not change the existing behavior. Link: http://lkml.kernel.org/r/20180627140817.27764-3-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a844c611b17c..1d1513215c22 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2887,16 +2887,20 @@ void wake_up_klogd(void) preempt_enable(); } -int vprintk_deferred(const char *fmt, va_list args) +void defer_console_output(void) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); - preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); +} + +int vprintk_deferred(const char *fmt, va_list args) +{ + int r; + + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + defer_console_output(); return r; } -- cgit v1.2.3 From 03fc7f9c99c1e7ae2925d459e8487f1a6f199f79 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 27 Jun 2018 16:20:28 +0200 Subject: printk/nmi: Prevent deadlock when accessing the main log buffer in NMI The commit 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI when logbuf_lock is available") brought back the possible deadlocks in printk() and NMI. The check of logbuf_lock is done only in printk_nmi_enter() to prevent mixed output. But another CPU might take the lock later, enter NMI, and: + Both NMIs might be serialized by yet another lock, for example, the one in nmi_cpu_backtrace(). + The other CPU might get stopped in NMI, see smp_send_stop() in panic(). The only safe solution is to use trylock when storing the message into the main log-buffer. It might cause reordering when some lines go to the main lock buffer directly and others are delayed via the per-CPU buffer. It means that it is not useful in general. This patch replaces the problematic NMI deferred context with NMI direct context. It can be used to mark a code that might produce many messages in NMI and the risk of losing them is more critical than problems with eventual reordering. The context is then used when dumping trace buffers on oops. 
It was the primary motivation for the original fix. Also the reordering is even smaller issue there because some traces have their own time stamps. Finally, nmi_cpu_backtrace() need not longer be serialized because it will always us the per-CPU buffers again. Fixes: 719f6a7040f1bdaf96 ("printk: Use the main logbuf in NMI when logbuf_lock is available") Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20180627142028.11259-1-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/internal.h | 9 ++++++- kernel/printk/printk_safe.c | 58 +++++++++++++++++++++++++++++---------------- kernel/trace/trace.c | 4 +++- 3 files changed, 48 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2a7d04049af4..0f1898820cba 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -19,11 +19,16 @@ #ifdef CONFIG_PRINTK #define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 #define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; +__printf(5, 0) +int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args); + __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); __printf(1, 0) int vprintk_func(const char *fmt, va_list args); @@ -54,6 +59,8 @@ void __printk_safe_exit(void); local_irq_enable(); \ } while (0) +void defer_console_output(void); + #else __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d7d091309054..a0a74c533e4b 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -308,24 +308,33 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) void printk_nmi_enter(void) { - /* - * The size of the extra per-CPU buffer is limited. Use it only when - * the main one is locked. If this CPU is not in the safe context, - * the lock must be taken on another CPU and we could wait for it. - */ - if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && - raw_spin_is_locked(&logbuf_lock)) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); - } else { - this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); - } + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); } void printk_nmi_exit(void) { - this_cpu_and(printk_context, - ~(PRINTK_NMI_CONTEXT_MASK | - PRINTK_NMI_DEFERRED_CONTEXT_MASK)); + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} + +/* + * Marks a code that might produce many messages in NMI context + * and the risk of losing them is more critical than eventual + * reordering. + * + * It has effect only when called in NMI context. Then printk() + * will try to store the messages into the main logbuf directly + * and use the per-CPU buffers only as a fallback when the lock + * is not available. 
+ */ +void printk_nmi_direct_enter(void) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); +} + +void printk_nmi_direct_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); } #else @@ -363,6 +372,20 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { + /* + * Try to use the main logbuf even in NMI. But avoid calling console + * drivers that might have their own locks. + */ + if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && + raw_spin_trylock(&logbuf_lock)) { + int len; + + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + raw_spin_unlock(&logbuf_lock); + defer_console_output(); + return len; + } + /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); @@ -371,13 +394,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) return vprintk_safe(fmt, args); - /* - * Use the main logbuf when logbuf_lock is available in NMI. - * But avoid calling console drivers that might have their own locks. - */ - if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) - return vprintk_deferred(fmt, args); - /* No obstacles. */ return vprintk_default(fmt, args); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bcd93031d042..f106ad12f72f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8265,6 +8265,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tracing_off(); local_irq_save(flags); + printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -8344,7 +8345,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); + atomic_dec(&dump_running); + printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); -- cgit v1.2.3 From e1a413245a564683697a3d02ec197b72cf009b89 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2018 09:56:08 +0800 Subject: Blktrace: bail out early if block debugfs is not configured Since @blk_debugfs_root couldn't be configured dynamically, we can save a few memory allocation if it's not there. 
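As an illustration (generic C, not kernel code): the shape of the early-bailout pattern applied above, i.e. check the cheap precondition before any allocation so the error path has nothing to unwind. The names below are invented for the sketch.

#include <stdlib.h>
#include <errno.h>

struct trace_setup {
	char *buf;
};

/* root_present stands in for the blk_debugfs_root check in the patch. */
static int trace_setup_init(struct trace_setup *ts, int root_present)
{
	if (!root_present)	/* bail out before allocating anything */
		return -ENOENT;

	ts->buf = malloc(4096);
	if (!ts->buf)
		return -ENOMEM;

	return 0;
}

int main(void)
{
	struct trace_setup ts;

	return trace_setup_init(&ts, 0) == -ENOENT ? 0 : 1;
}
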
Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae283..b951aa1fac61 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; + if (!blk_debugfs_root) + return -ENOENT; + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - if (!blk_debugfs_root) - goto err; - dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); -- cgit v1.2.3 From b061c7a513afe14a68af41cec7c3476befc40e95 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Mon, 4 Jun 2018 15:34:21 +0200 Subject: timekeeping: Update multiplier when NTP frequency is set directly When the NTP frequency is set directly from userspace using the ADJ_FREQUENCY or ADJ_TICK timex mode, immediately update the timekeeper's multiplier instead of waiting for the next tick. This removes a hidden non-deterministic delay in setting of the frequency and allows an extremely tight control of the system clock with update rates close to or even exceeding the kernel HZ. The update is limited to archs using modern timekeeping (!ARCH_USES_GETTIMEOFFSET). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Miroslav Lichvar Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4786df904c22..67720110dc33 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -34,6 +34,14 @@ #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) +enum timekeeping_adv_mode { + /* Update timekeeper when a tick has passed */ + TK_ADV_TICK, + + /* Update timekeeper on a direct frequency change */ + TK_ADV_FREQ +}; + /* * The most important data for readout fits into a single 64 byte * cache line. 
@@ -2021,11 +2029,11 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, return offset; } -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * +/* + * timekeeping_advance - Updates the timekeeper to the current time and + * current NTP tick length */ -void update_wall_time(void) +static void timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; @@ -2042,14 +2050,17 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; + + if (mode != TK_ADV_TICK) + goto out; #else offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); -#endif /* Check if there's really nothing to do */ - if (offset < real_tk->cycle_interval) + if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) goto out; +#endif /* Do some additional sanity checking */ timekeeping_check_update(tk, offset); @@ -2105,6 +2116,15 @@ out: clock_was_set_delayed(); } +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + */ +void update_wall_time(void) +{ + timekeeping_advance(TK_ADV_TICK); +} + /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set @@ -2327,6 +2347,10 @@ int do_adjtimex(struct timex *txc) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + timekeeping_advance(TK_ADV_FREQ); + if (tai != orig_tai) clock_was_set(); -- cgit v1.2.3 From ffaa619af1b062d5a37871df6363aa52356007a7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 10 Jul 2018 10:44:53 +0200 Subject: printk: Fix warning about unused suppress_message_printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit suppress_message_printing() is not longer called in console_unlock(). Therefore it is not longer needed with disabled CONFIG_PRINTK. This fixes the warning: kernel/printk/printk.c:2033:13: warning: ‘suppress_message_printing’ defined but not used [-Wunused-function] static bool suppress_message_printing(int level) { return false; } Reported-by: Stephen Rothwell Reported-by: Arnd Bergmann Suggested-by: Maninder Singh Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fcc1992c040a..e2cb0fc18e2d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2021,7 +2021,6 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ -- cgit v1.2.3 From 7a6e55375d5c584c7c11cfdcaa8ad9d6cccb419d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 Jul 2018 16:41:18 +0200 Subject: hrtimer: Improve kernel message printing - Join split message for easier grepping, - Use pr_*() instead of printk*(), - Use %u to format unsigned cpu numbers. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180712144118.8819-1-geert+renesas@glider.be --- kernel/time/hrtimer.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 055a4a728c00..5c2af16c7c14 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -718,8 +718,8 @@ static void hrtimer_switch_to_hres(void) struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", base->cpu); + pr_warn("Could not switch to high resolution mode on CPU %u\n", + base->cpu); return; } base->hres_active = 1; @@ -1573,8 +1573,7 @@ retry: else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); - printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", - ktime_to_ns(delta)); + pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } /* called with interrupts disabled */ -- cgit v1.2.3 From fcc63543650150629c8a873cbef3578770acecd9 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 15 Jun 2018 12:06:31 -0700 Subject: rcu: Make expedited GPs handle CPU 0 being offline Currently, the parallelized initialization of expedited grace periods uses the workqueue associated with each rcu_node structure's ->grplo field. This works fine unless that CPU is offline. This commit therefore uses the CPU corresponding to the lowest-numbered online CPU, or just queues the work on WORK_CPU_UNBOUND if there are no online CPUs corresponding to this rcu_node structure. Note that this patch uses cpu_is_offline() instead of the usual approach of checking bits in the rcu_node structure's ->qsmaskinitnext field. This is safe because preemption is disabled across both the cpu_is_offline() check and the call to queue_work_on(). Signed-off-by: Boqun Feng [ paulmck: Disable preemption to close offline race window. ] Signed-off-by: Paul E. McKenney [ paulmck: Apply Peter Zijlstra feedback on CPU selection. ] Tested-by: Aneesh Kumar K.V --- kernel/rcu/tree_exp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index c6385ee1af65..b3df3b770afb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -472,6 +472,7 @@ retry_ipi: static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, smp_call_func_t func) { + int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); @@ -493,7 +494,13 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_disable(); + cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi)) + cpu = WORK_CPU_UNBOUND; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + preempt_enable(); rnp->exp_need_flush = true; } -- cgit v1.2.3 From 26d950a9451336a6b5abc1c8ca6c21df58e8d89f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 21 Apr 2018 20:44:11 -0700 Subject: rcu: Diagnostics for grace-period startup hangs This commit causes a splat if RCU is idle and a request for a new grace period is ignored for more than one second. 
This splat normally indicates that some code path asked for a new grace period, but failed to wake up the RCU grace-period kthread. Signed-off-by: Paul E. McKenney [ paulmck: Fix bug located by Dan Carpenter and his static checker. ] [ paulmck: Fix self-deadlock bug located 0day test robot. ] [ paulmck: Disable unless CONFIG_PROVE_RCU=y. ] --- kernel/rcu/tree.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 ++ 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b1fffa21b9e4..6ce82c009195 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1681,6 +1681,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, } trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); goto unlock_out; @@ -2113,6 +2114,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Advance CBs to reduce false positives below. */ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + rsp->gp_req_activity = jiffies; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } @@ -2744,6 +2746,65 @@ static void force_quiescent_state(struct rcu_state *rsp) rcu_gp_kthread_wake(rsp); } +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void +rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(rsp); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || + rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp))) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + time_before(j, rsp->gp_req_activity + HZ) || + time_before(j, rsp->gp_activity + HZ) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + pr_alert("%s: g%lu %d%d%d%d gar:%lu ga:%lu f%#x %s->state:%#lx\n", + __func__, READ_ONCE(rsp->gpnum), + need_future_gp_element(rcu_get_root(rsp), 0), + need_future_gp_element(rcu_get_root(rsp), 1), + need_future_gp_element(rcu_get_root(rsp), 2), + need_future_gp_element(rcu_get_root(rsp), 3), + j - rsp->gp_req_activity, j - rsp->gp_activity, + rsp->gp_flags, rsp->name, + rsp->gp_kthread ? 
rsp->gp_kthread->state : 0x1ffffL); + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to @@ -2755,7 +2816,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; WARN_ON_ONCE(!rdp->beenonline); @@ -2769,7 +2830,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { local_irq_restore(flags); } else { - rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2778,6 +2838,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) } } + rcu_check_gp_start_stall(rsp, rnp, rdp); + /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7365ac53fdd9..3c1942174c56 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -374,6 +374,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long gp_activity; /* Time of last GP kthread */ /* activity in jiffies. */ + unsigned long gp_req_activity; /* Time of last GP request */ + /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ -- cgit v1.2.3 From 18390aeae7010ee56b132dcfb663ba362a099d99 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Apr 2018 15:06:05 -0700 Subject: rcu: Make rcu_gp_cleanup() write only once to ->gp_flags At the end of rcu_gp_cleanup(), if another grace period is needed, but not via rcu_accelerate_cbs(), the ->gp_flags field is written twice, once when making the new grace-period request, and once when clearing all other types of requests. This commit therefore adds an else-clause to avoid this double write. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ce82c009195..a9a4a260ea7d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2117,8 +2117,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_req_activity = jiffies; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + } else { + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); } - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } -- cgit v1.2.3 From de30ad512a668b56e7ad7a5a7c379d7c5d138a94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2018 11:52:09 -0700 Subject: rcu: Introduce grace-period sequence numbers This commit adds grace-period sequence numbers (->gp_seq) to the rcu_state, rcu_node, and rcu_data structures, and updates them. It also checks for consistency between rsp->gpnum and rsp->gp_seq. These ->gp_seq counters will eventually replace the existing ->gpnum and ->completed counters, allowing a single memory access to determine whether or not a grace period is in progress and if so, which one. This in turn will enable changes that will reduce ->lock contention on the leaf rcu_node structures. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 16 +++++++++++++++- kernel/rcu/tree.h | 3 +++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a9a4a260ea7d..467cd8e5c6ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,6 +97,7 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ + .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -1849,6 +1850,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); } + if (rdp->gp_seq != rnp->gp_seq) + rdp->gp_seq = rnp->gp_seq; return ret; } @@ -1910,7 +1913,10 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); /* Record GP times before starting GP, hence smp_store_release(). */ + WARN_ON_ONCE(rsp->gpnum << RCU_SEQ_CTR_SHIFT != rsp->gp_seq); smp_store_release(&rsp->gpnum, rsp->gpnum + 1); + smp_mb(); /* Pairs with barriers in stall-warning code. */ + rcu_seq_start(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1984,6 +1990,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rnp->gpnum, rsp->gpnum); if (WARN_ON_ONCE(rnp->completed != rsp->completed)) WRITE_ONCE(rnp->completed, rsp->completed); + WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -2050,6 +2057,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; + unsigned long new_gp_seq; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2079,12 +2087,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * all of the rcu_node structures before the beginning of the next * grace period is recorded in any of the rcu_node structures. */ + new_gp_seq = rsp->gp_seq; + rcu_seq_end(&new_gp_seq); rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); + WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; @@ -2098,10 +2109,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); + rcu_seq_end(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ @@ -3612,6 +3624,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ rdp->completed = rnp->completed; + rdp->gp_seq = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; @@ -3991,6 +4004,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) &rcu_fqs_class[i], fqs[i]); rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; + rnp->gp_seq = rsp->gp_seq; rnp->completedqs = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3c1942174c56..50a28d1cf5a1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -87,6 +87,7 @@ struct rcu_node { unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -211,6 +212,7 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ @@ -343,6 +345,7 @@ struct rcu_state { /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ + unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ -- cgit v1.2.3 From dee4f42298bba030e84035aca5c114f9fee8fa6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Apr 2018 15:30:28 -0700 Subject: rcu: Move rcu_gp_slow() to ->gp_seq This commit moves rcu_gp_slow() to ->gp_seq. This function only uses the grace-period number to modulate delay, so rcu_seq_ctr(rsp->gp_seq) gets the same effect, at least in cases where the delay is to happen more than four times per wrap of an unsigned long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 467cd8e5c6ff..3c3af7e2758f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1879,7 +1879,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_gp_slow(struct rcu_state *rsp, int delay) { if (delay > 0 && - !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + !(rcu_seq_ctr(rsp->gp_seq) % + (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_uninterruptible(delay); } -- cgit v1.2.3 From 17ef2fe97c8c8e754e4a702c42f8e5b0ffadf4dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 11:39:34 -0700 Subject: rcu: Make rcutorture's batches-completed API use ->gp_seq The rcutorture test invokes rcu_batches_started(), rcu_batches_completed(), rcu_batches_started_bh(), rcu_batches_completed_bh(), rcu_batches_started_sched(), and rcu_batches_completed_sched() to do grace-period consistency checks, and rcuperf uses the _completed variants for statistics. These functions use ->gpnum and ->completed. 
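For orientation, a ->gp_seq value packs the grace-period phase into its low-order bits and the grace-period count into the remaining bits, so a single read answers both "is a grace period running?" and "which one?". The stand-alone sketch below illustrates that encoding; it borrows the helper names used throughout this series, but the RCU_SEQ_CTR_SHIFT/RCU_SEQ_STATE_MASK values are an assumption mirroring kernel/rcu/rcu.h of this era, and memory ordering plus overflow checks are deliberately omitted.

#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT   2                               /* assumed value, mirrors kernel/rcu/rcu.h */
#define RCU_SEQ_STATE_MASK  ((1UL << RCU_SEQ_CTR_SHIFT) - 1)

static unsigned long rcu_seq_ctr(unsigned long s)   { return s >> RCU_SEQ_CTR_SHIFT; }  /* grace-period count */
static unsigned long rcu_seq_state(unsigned long s) { return s & RCU_SEQ_STATE_MASK; }  /* nonzero: GP in progress */
static void rcu_seq_start(unsigned long *sp) { *sp += 1; }                              /* idle -> in progress */
static void rcu_seq_end(unsigned long *sp)   { *sp = (*sp | RCU_SEQ_STATE_MASK) + 1; }  /* round up to next idle value */

int main(void)
{
	unsigned long gp_seq = 0;

	rcu_seq_start(&gp_seq);   /* ctr=0, state=1: grace period running */
	printf("started:   ctr=%lu state=%lu\n", rcu_seq_ctr(gp_seq), rcu_seq_state(gp_seq));
	rcu_seq_end(&gp_seq);     /* ctr=1, state=0: one grace period completed */
	printf("completed: ctr=%lu state=%lu\n", rcu_seq_ctr(gp_seq), rcu_seq_state(gp_seq));
	return 0;
}

With that encoding, rcu_seq_state() is nonzero exactly while a grace period is in flight and rcu_seq_ctr() advances by one per completed grace period, which is what the replacement functions below report.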
This commit therefore replaces them with rcu_get_gp_seq(), rcu_bh_get_gp_seq(), and rcu_sched_get_gp_seq(), adjusting rcutorture and rcuperf to make use of them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 18 ++++++----------- kernel/rcu/rcuperf.c | 26 +++++++++---------------- kernel/rcu/rcutorture.c | 50 +++++++++++++++++------------------------------- kernel/rcu/tree.c | 51 ++++++++++++------------------------------------- 4 files changed, 45 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index db0870acfdff..f0907f9f6cd0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -463,12 +463,9 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU -static inline unsigned long rcu_batches_started(void) { return 0; } -static inline unsigned long rcu_batches_started_bh(void) { return 0; } -static inline unsigned long rcu_batches_started_sched(void) { return 0; } -static inline unsigned long rcu_batches_completed(void) { return 0; } -static inline unsigned long rcu_batches_completed_bh(void) { return 0; } -static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_bh_get_gp_seq(void) { return 0; } +static inline unsigned long rcu_sched_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } static inline unsigned long @@ -480,12 +477,9 @@ static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -unsigned long rcu_batches_started(void); -unsigned long rcu_batches_started_bh(void); -unsigned long rcu_batches_started_sched(void); -unsigned long rcu_batches_completed(void); -unsigned long rcu_batches_completed_bh(void); -unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_get_gp_seq(void); +unsigned long rcu_bh_get_gp_seq(void); +unsigned long rcu_sched_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index df29119b2013..2b5a613afcf3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -138,8 +138,7 @@ struct rcu_perf_ops { void (*cleanup)(void); int (*readlock)(void); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -179,8 +178,7 @@ static struct rcu_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -209,8 +207,7 @@ static struct rcu_perf_ops rcu_bh_ops = { .init = rcu_sync_perf_init, .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, 
@@ -266,8 +263,7 @@ static struct rcu_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -295,8 +291,7 @@ static struct rcu_perf_ops srcud_ops = { .cleanup = srcu_sync_perf_cleanup, .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, - .started = NULL, - .completed = srcu_perf_completed, + .get_gp_seq = srcu_perf_completed, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -325,8 +320,7 @@ static struct rcu_perf_ops sched_ops = { .init = rcu_sync_perf_init, .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -353,8 +347,7 @@ static struct rcu_perf_ops tasks_ops = { .init = rcu_sync_perf_init, .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -447,8 +440,7 @@ rcu_perf_writer(void *arg) b_rcu_perf_writer_started = cur_ops->exp_completed() / 2; } else { - b_rcu_perf_writer_started = - cur_ops->completed(); + b_rcu_perf_writer_started = cur_ops->get_gp_seq(); } } @@ -505,7 +497,7 @@ retry: cur_ops->exp_completed() / 2; } else { b_rcu_perf_writer_finished = - cur_ops->completed(); + cur_ops->get_gp_seq(); } if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5604bfac8df4..1f66597c7783 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -264,8 +264,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - unsigned long (*started)(void); - unsigned long (*completed)(void); + unsigned long (*get_gp_seq)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -305,10 +304,10 @@ static void rcu_read_delay(struct torture_random_state *rrsp) * force_quiescent_state. */ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); mdelay(longdelay_ms); - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); do_trace_rcu_torture_read(cur_ops->name, NULL, ts, started, completed); } @@ -400,8 +399,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .started = rcu_batches_started, - .completed = rcu_batches_completed, + .get_gp_seq = rcu_get_gp_seq, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -442,8 +440,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_bh_torture_read_unlock, - .started = rcu_batches_started_bh, - .completed = rcu_batches_completed_bh, + .get_gp_seq = rcu_bh_get_gp_seq, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -486,8 +483,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, @@ -575,8 +571,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -613,8 +608,7 @@ static struct rcu_torture_ops srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, - .started = NULL, - .completed = srcu_torture_completed, + .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -651,8 +645,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .started = rcu_batches_started_sched, - .completed = rcu_batches_completed_sched, + .get_gp_seq = rcu_sched_get_gp_seq, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -690,8 +683,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = tasks_torture_read_unlock, - .started = rcu_no_completed, - .completed = rcu_no_completed, + .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, @@ -1104,10 +1096,7 @@ static void rcu_torture_timer(struct timer_list *unused) unsigned long long ts; idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1131,7 +1120,7 @@ static void rcu_torture_timer(struct timer_list *unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); @@ -1139,8 +1128,8 @@ static void rcu_torture_timer(struct timer_list *unused) } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; - if (cur_ops->started) - completed++; + if (completed > ULONG_MAX >> 1) + completed = 0; /* Not all gp_seq have full range. */ if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; @@ -1187,10 +1176,7 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - if (cur_ops->started) - started = cur_ops->started(); - else - started = cur_ops->completed(); + started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1212,7 +1198,7 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed = cur_ops->completed(); + completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); @@ -1220,8 +1206,8 @@ rcu_torture_reader(void *arg) } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; - if (cur_ops->started) - completed++; + if (completed > ULONG_MAX >> 1) + completed = 0; /* Not all gp_seq have full range. */ if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c3af7e2758f..547112bec26a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -530,58 +530,31 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU batches started thus far for debug & stats. + * Return the number of RCU GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started(void) +unsigned long rcu_get_gp_seq(void) { - return rcu_state_p->gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_state_p->gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started); +EXPORT_SYMBOL_GPL(rcu_get_gp_seq); /* - * Return the number of RCU-sched batches started thus far for debug & stats. + * Return the number of RCU-sched GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_sched(void) +unsigned long rcu_sched_get_gp_seq(void) { - return rcu_sched_state.gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_sched_state.gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started_sched); +EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); /* - * Return the number of RCU BH batches started thus far for debug & stats. + * Return the number of RCU-bh GPs completed thus far for debug & stats. */ -unsigned long rcu_batches_started_bh(void) +unsigned long rcu_bh_get_gp_seq(void) { - return rcu_bh_state.gpnum; + return rcu_seq_ctr(READ_ONCE(rcu_bh_state.gp_seq)); } -EXPORT_SYMBOL_GPL(rcu_batches_started_bh); - -/* - * Return the number of RCU batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_state_p->completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU-sched batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_sched(void) -{ - return rcu_sched_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); - -/* - * Return the number of RCU BH batches completed thus far for debug & stats. - */ -unsigned long rcu_batches_completed_bh(void) -{ - return rcu_bh_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); /* * Return the number of RCU expedited batches completed thus far for -- cgit v1.2.3 From 78c5a67f1788f59d991c8e78d674ad3c8542f0ac Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 27 Apr 2018 13:32:28 -0700 Subject: rcu: Convert rcu_check_gp_kthread_starvation() to GP sequence number This commit switches rcu_check_gp_kthread_starvation() from printing ->gpnum and ->completed to printing ->gp_seq upon detecting a starving RCU grace-period kthread during an RCU CPU stall warning. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 547112bec26a..002f85357226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1276,9 +1276,9 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, + (long)rcu_seq_current(&rsp->gp_seq), rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0, -- cgit v1.2.3 From c9a24e2d0c7d33b141167f5fa13f95cf6d35cb1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 14:54:46 -0700 Subject: rcu: Make quiescent-state reporting use ->gp_seq This commit switches the functions reporting quiescent states from use of ->gpnum to ->gp_seq. In either case, the point is to handle races where a given grace period ends before a quiescent state can be reported. Failing to catch these races would result in too-short grace periods, hence the checking. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 002f85357226..a54587dc13f0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2242,7 +2242,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * must be represented by the same rcu_node structure (which need not be a * leaf rcu_node structure, though it often will be). The gps parameter * is the grace-period snapshot, which means that the quiescent states - * are valid only if rnp->gpnum is equal to gps. That structure's lock + * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. */ static void @@ -2257,7 +2257,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { + if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the @@ -2335,8 +2335,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy, tracking current ->gpnum. */ - gps = rnp->gpnum; + /* Report up the rest of the hierarchy, tracking current ->gp_seq. */ + gps = rnp->gp_seq; mask = rnp->grpmask; raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. 
*/ @@ -2357,8 +2357,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2383,7 +2383,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); @@ -2688,8 +2688,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) } } if (mask != 0) { - /* Idle/offline CPUs, report (releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* Idle/offline CPUs, report (releases rnp->lock). */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -- cgit v1.2.3 From e4be81a2ed3a7356a2c22c7571af622ceb57eb2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 15:16:50 -0700 Subject: rcu: Convert conditional grace-period primitives to ->gp_seq This commit converts get_state_synchronize_rcu(), cond_synchronize_rcu(), get_state_synchronize_sched(), and cond_synchronize_sched() from ->gpnum and ->completed to ->gp_seq. Note that this also introduces a full memory barrier in the already-done paths of cond_synchronize_rcu() and cond_synchronize_sched(), as work with LKMM indicates that the earlier smp_load_acquire() calls were insufficiently strong in some situations where these two functions were called just as the grace period ended. In such cases, these two functions would not gain the benefit of memory ordering at the end of the grace period. Please note that the performance impact is negligible, as you shouldn't be using either function anywhere near a fastpath in any case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a54587dc13f0..fd2f582a6db0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3183,16 +3183,10 @@ unsigned long get_state_synchronize_rcu(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_rcu() - * and cond_synchronize_rcu(). - */ - return smp_load_acquire(&rcu_state_p->gpnum); + return rcu_seq_snap(&rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3212,15 +3206,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_state_p->completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) synchronize_rcu(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. 
*/ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); @@ -3235,16 +3224,10 @@ unsigned long get_state_synchronize_sched(void) { /* * Any prior manipulation of RCU-protected data must happen - * before the load from ->gpnum. + * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - - /* - * Make sure this load happens before the purportedly - * time-consuming work between get_state_synchronize_sched() - * and cond_synchronize_sched(). - */ - return smp_load_acquire(&rcu_sched_state.gpnum); + return rcu_seq_snap(&rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_sched); @@ -3264,15 +3247,10 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_sched); */ void cond_synchronize_sched(unsigned long oldstate) { - unsigned long newstate; - - /* - * Ensure that this load happens before any RCU-destructive - * actions the caller might carry out after we return. - */ - newstate = smp_load_acquire(&rcu_sched_state.completed); - if (ULONG_CMP_GE(oldstate, newstate)) + if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) synchronize_sched(); + else + smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -- cgit v1.2.3 From 67e14c1e39d2d956300b3d6ad00f7708e3285531 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 16:01:46 -0700 Subject: rcu: Move RCU's grace-period-change code to ->gp_seq This commit moves __note_gp_changes(), note_gp_changes(), and __rcu_pending() to ->gp_seq, creating new rcu_seq_completed_gp() and rcu_seq_new_gp() functions for this purpose. Signed-off-by: Paul E. McKenney [ paulmck: Reinstate "cpuend: trace as suggested by Joel Fernandes. ] --- kernel/rcu/rcu.h | 17 +++++++++++++++++ kernel/rcu/tree.c | 39 ++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f0907f9f6cd0..7568a3fd0815 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -116,6 +116,23 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } +/* + * Has a grace period completed since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT(old, new & ~RCU_SEQ_STATE_MASK); +} + +/* + * Has a grace period started since the time the old gp_seq was collected? + */ +static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) +{ + return ULONG_CMP_LT((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK, + new); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fd2f582a6db0..bcd659e65dfd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1790,24 +1790,23 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, raw_lockdep_assert_held_rcu_node(rnp); - /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed && - !unlikely(READ_ONCE(rdp->gpwrap))) { - - /* No grace period end, so just accelerate recent callbacks. */ - ret = rcu_accelerate_cbs(rsp, rnp, rdp); - - } else { - - /* Advance callbacks. */ - ret = rcu_advance_cbs(rsp, rnp, rdp); + if (rdp->gp_seq == rnp->gp_seq) + return false; /* Nothing to do. */ + /* Handle the ends of any preceding grace periods first. 
*/ + if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); + } else { + ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ } - if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { + /* Now handle the beginnings of any new-to-this-CPU grace periods. */ + if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || + unlikely(READ_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1823,8 +1822,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); } - if (rdp->gp_seq != rnp->gp_seq) - rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ return ret; } @@ -1836,8 +1834,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; - if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && - rdp->completed == READ_ONCE(rnp->completed) && + if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); @@ -3286,12 +3283,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; - /* Has another RCU grace period completed? */ - if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ - return 1; - - /* Has a new RCU grace period started? */ - if (READ_ONCE(rnp->gpnum) != rdp->gpnum || + /* Have RCU grace period completed or started? */ + if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq || unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; -- cgit v1.2.3 From a66ae8ae35de60a85d2d89689d8c2d6a4dc15d85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 18:06:08 -0700 Subject: rcu: Convert rcu_gpnum_ovf() to ->gp_seq This commit converts rcu_gpnum_ovf() to use ->gp_seq instead of ->gpnum. Same size unsigned long, so same approach. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcd659e65dfd..de2e2c5d64bb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1089,14 +1089,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* * We are reporting a quiescent state on behalf of some other CPU, so * it is our responsibility to check for and handle potential overflow - * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no * code whatsoever. 
*/ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, + rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; -- cgit v1.2.3 From e05720b0977bd50707ea6cf296f99e709de3f760 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 18:58:58 -0700 Subject: rcu: Move rcu_implicit_dynticks_qs() to ->gp_seq This commit makes rcu_implicit_dynticks_qs() use ->gp_seq, with the exception of tracing, which will be converted later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index de2e2c5d64bb..b3af3d24286c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1178,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && - READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; -- cgit v1.2.3 From 03c8cb765a747c02fd8d3fade1efe9d529ad54bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 19:56:16 -0700 Subject: rcu: Move rcu_try_advance_all_cbs() to ->gp_seq This commit makes rcu_try_advance_all_cbs() use ->gp_seq, with the exception of tracing, which will be converted later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e387ea712758..d893899b72f4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1467,7 +1467,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. */ - if ((rdp->completed != rnp->completed || + if ((rcu_seq_completed_gp(rdp->gp_seq, + rcu_seq_current(&rnp->gp_seq)) || unlikely(READ_ONCE(rdp->gpwrap))) && rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); -- cgit v1.2.3 From e0da2374c3881cb9a512e2718f9ca655a48de9db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 20:51:36 -0700 Subject: rcu: Move rcu_nocb_gp_get() to ->gp_seq This commit makes rcu_nocb_gp_get() use ->gp_seq. It uses rcu_seq_ctr() in order to shift away the state bits, so that the low-order bits of the result may safely be used to index ->nocb_gp_wq[]. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d893899b72f4..5b10904669c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1852,7 +1852,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { - return &rnp->nocb_gp_wq[rnp->completed & 0x1]; + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; } static void rcu_init_one_nocb(struct rcu_node *rnp) -- cgit v1.2.3 From ba04107fc901ddce49686944f7e038b4f0b0a359 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 21:25:01 -0700 Subject: rcu: Move rcu_gp_in_progress() to ->gp_seq This commit makes rcu_gp_in_progress() use ->gp_seq instead of ->completed and ->gpnum. The READ_ONCE() invocations are buried in rcu_seq_current(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3af3d24286c..56445f4c09a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,7 +219,7 @@ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) */ static int rcu_gp_in_progress(struct rcu_state *rsp) { - return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum); + return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); } /* -- cgit v1.2.3 From 8aa670cdacc1820cb0597e4b4b413ef91ede2dd9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Apr 2018 14:15:40 -0700 Subject: rcu: Convert ->rcu_iw_gpnum to ->gp_seq This commit switches the interrupt-disabled detection mechanism to ->gp_seq. This mechanism is used as part of RCU CPU stall warnings, and detects cases where the stall is due to a CPU having interrupts disabled. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 56445f4c09a8..2ddbd1cfb31a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1099,8 +1099,8 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, rnp->gp_seq)) WRITE_ONCE(rdp->gpwrap, true); - if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) - rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; + if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) + rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } /* @@ -1134,7 +1134,7 @@ static void rcu_iw_handler(struct irq_work *iwp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; rdp->rcu_iw_pending = false; } raw_spin_unlock_rcu_node(rnp); @@ -1231,11 +1231,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); if (IS_ENABLED(CONFIG_IRQ_WORK) && - !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); rdp->rcu_iw_pending = true; - rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } } @@ -3575,7 +3575,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gpnum = rnp->gpnum - 1; + rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 50a28d1cf5a1..6d6cbc8b3a9c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -286,7 +286,7 @@ struct rcu_data { /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ - unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ + unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5b10904669c5..bc32e1f434a6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1763,7 +1763,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], -- cgit v1.2.3 From d43a5d32e125db1e34641922e52baaa4fee3510a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 28 Apr 2018 18:50:06 -0700 Subject: rcu: Convert ->completedqs to ->gp_seq This commit switches the quiescent-state no-backtracking checks from ->gpnum and ->completed to ->gp_seq. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2ddbd1cfb31a..da2ca9cc078b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2278,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->completedqs = rnp->gpnum; + rnp->completedqs = rnp->gp_seq; mask = rnp->grpmask; if (rnp->parent == NULL) { @@ -3951,7 +3951,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; - rnp->completedqs = rsp->completed; + rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bc32e1f434a6..2a81d243521f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -262,7 +262,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { rnp->gp_tasks = &t->rcu_node_entry; - WARN_ON_ONCE(rnp->completedqs == rnp->gpnum); + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; @@ -537,7 +537,7 @@ void rcu_read_unlock_special(struct task_struct *t) WARN_ON_ONCE(rnp != t->rcu_blocked_node); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - WARN_ON_ONCE(rnp->completedqs == rnp->gpnum && + WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit v1.2.3 From 29365e563b1e4e5bfde211280d37dc6127c019ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2018 10:57:36 -0700 Subject: rcu: Convert grace-period requests to ->gp_seq This commit converts the grace-period request code paths from ->completed and ->gpnum to ->gp_seq. The need_future_gp_element() macro encapsulates the shift operation required to use ->gp_seq as an index to the ->need_future_gp[] array. The rcu_cbs_completed() function is removed in favor of the rcu_seq_snap() function. The rcu_start_this_gp() function gets some temporary consistency checks and uses rcu_seq_done(), rcu_seq_current(), rcu_seq_state(), and rcu_gp_in_progress() in place of the earlier open-coded comparisons of ->gpnum and ->completed. The rcu_future_gp_cleanup() function replaces use of ->completed with ->gp_seq. The rcu_accelerate_cbs() function replaces a call to rcu_cbs_completed() with one to rcu_seq_snap(). The rcu_advance_cbs() function replaces an access to ->completed with one to ->gp_seq and adds some temporary warnings. The rcu_nocb_wait_gp() function replaces a call to rcu_cbs_completed() with one to rcu_seq_snap() and an open-coded comparison with rcu_seq_done(). The temporary warnings will be removed when the various ->gpnum and ->completed fields are removed. Their purpose is to locate code that might still be using ->gpnum and ->completed. (Much easier that way than trying to trace down the causes of too-short grace periods and grace-period hangs!) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 84 ++++++++++++------------------------------------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 ++-- 3 files changed, 24 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da2ca9cc078b..ffa45b6175d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1548,52 +1548,6 @@ void rcu_cpu_stall_reset(void) WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } -/* - * Determine the value that ->completed will have at the end of the - * next subsequent grace period. This is used to tag callbacks so that - * a CPU can invoke callbacks in a timely fashion even if that CPU has - * been dyntick-idle for an extended period with callbacks under the - * influence of RCU_FAST_NO_HZ. - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static unsigned long rcu_cbs_completed(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * If RCU is idle, we just wait for the next grace period. - * But we can only be sure that RCU is idle if we are looking - * at the root rcu_node structure -- otherwise, a new grace - * period might have started, but just not yet gotten around - * to initializing the current non-root rcu_node structure. - */ - if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) - return rnp->completed + 1; - - /* - * If the current rcu_node structure believes that RCU is - * idle, and if the rcu_state structure does not yet reflect - * the start of a new grace period, then the next grace period - * will suffice. The memory barrier is needed to accurately - * sample the rsp->gpnum, and pairs with the second lock - * acquisition in rcu_gp_init(), which is augmented with - * smp_mb__after_unlock_lock() for this purpose. - */ - if (rnp->gpnum == rnp->completed) { - smp_mb(); /* See above block comment. */ - if (READ_ONCE(rsp->gpnum) == rnp->completed) - return rnp->completed + 1; - } - - /* - * Otherwise, wait for a possible partial grace period and - * then the subsequent full grace period. - */ - return rnp->completed + 2; -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long c, const char *s) @@ -1629,16 +1583,16 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); + WARN_ON_ONCE(c & 0x2); /* Catch any lingering use of ->gpnum. */ + WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq)); /* Catch any ->completed/->gp_seq mismatches. */ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + - need_future_gp_mask(), c)); if (need_future_gp_element(rnp_root, c) || - ULONG_CMP_GE(rnp_root->gpnum, c) || + rcu_seq_done(&rnp_root->gp_seq, c) || (rnp != rnp_root && - rnp_root->gpnum != rnp_root->completed)) { + rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); goto unlock_out; } @@ -1650,7 +1604,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* If GP already in progress, just leave, otherwise start one. 
*/ - if (rnp_root->gpnum != rnp_root->completed) { + if (rcu_gp_in_progress(rsp)) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } @@ -1675,7 +1629,7 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->completed; + unsigned long c = rnp->gp_seq; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); @@ -1703,14 +1657,14 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) } /* - * If there is room, assign a ->completed number to any callbacks on - * this CPU that have not already been assigned. Also accelerate any - * callbacks that were previously assigned a ->completed number that has - * since proven to be too conservative, which can happen if callbacks get - * assigned a ->completed number while RCU is idle, but with reference to - * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. Returns an flag saying that we should - * awaken the RCU grace-period kthread. + * If there is room, assign a ->gp_seq number to any callbacks on this + * CPU that have not already been assigned. Also accelerate any callbacks + * that were previously assigned a ->gp_seq number that has since proven + * to be too conservative, which can happen if callbacks get assigned a + * ->gp_seq number while RCU is idle, but with reference to a non-root + * rcu_node structure. This function is idempotent, so it does not hurt + * to call it repeatedly. Returns an flag saying that we should awaken + * the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ @@ -1736,7 +1690,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - c = rcu_cbs_completed(rsp, rnp); + c = rcu_seq_snap(&rsp->gp_seq); if (rcu_segcblist_accelerate(&rdp->cblist, c)) ret = rcu_start_this_gp(rnp, rdp, c); @@ -1751,7 +1705,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and - * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... * Returns true if the RCU grace-period kthread needs to be awakened. @@ -1768,10 +1722,10 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return false; /* - * Find all callbacks whose ->completed numbers indicate that they + * Find all callbacks whose ->gp_seq numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - rcu_segcblist_advance(&rdp->cblist, rnp->completed); + rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1889,6 +1843,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) smp_store_release(&rsp->gpnum, rsp->gpnum + 1); smp_mb(); /* Pairs with barriers in stall-warning code. */ rcu_seq_start(&rsp->gp_seq); + if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. 
*/ + pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6d6cbc8b3a9c..a21d403a6010 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -174,7 +174,7 @@ struct rcu_node { #define need_future_gp_mask() \ (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) #define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) + ((rnp)->need_future_gp[(c >> RCU_SEQ_CTR_SHIFT) & need_future_gp_mask()]) #define need_any_future_gp(rnp) \ ({ \ int __i; \ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2a81d243521f..2036dc7426ac 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2105,7 +2105,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - c = rcu_cbs_completed(rdp->rsp, rnp); + c = rcu_seq_snap(&rdp->rsp->gp_seq); needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) @@ -2118,8 +2118,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( - rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); + rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], + (d = rcu_seq_done(&rnp->gp_seq, c))); if (likely(d)) break; WARN_ON(signal_pending(current)); -- cgit v1.2.3 From 471f87c3d91bd0884451c0e3071473476c165630 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Apr 2018 13:09:17 -0700 Subject: rcu: Make RCU CPU stall warnings use ->gp_seq This commit makes the RCU CPU stall-warning code in print_other_cpu_stall(), print_cpu_stall(), and check_cpu_stall() use ->gp_seq instead of ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 ++++++++++++++++++++++++------------------------ kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ffa45b6175d9..9e619c4878d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1340,7 +1340,7 @@ static inline void panic_on_rcu_stall(void) panic("RCU Stall\n"); } -static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) { int cpu; unsigned long flags; @@ -1350,6 +1350,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + WARN_ON_ONCE(gp_seq & 0x2); /* Remove when ->gpnum removed. */ + /* Kick and suppress, if so configured. 
*/ rcu_stall_kick_kthreads(rsp); if (rcu_cpu_stall_suppress) @@ -1380,17 +1382,16 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); } else { - if (READ_ONCE(rsp->gpnum) != gpnum || - READ_ONCE(rsp->completed) == gpnum) { + if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; @@ -1442,9 +1443,9 @@ static void print_cpu_stall(struct rcu_state *rsp) for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, cpu)->cblist); - pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", jiffies - rsp->gp_start, - (long)rsp->gpnum, (long)rsp->completed, totqlen); + (long)rcu_seq_current(&rsp->gp_seq), totqlen); rcu_check_gp_kthread_starvation(rsp); @@ -1471,8 +1472,8 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - unsigned long completed; - unsigned long gpnum; + unsigned long gs1; + unsigned long gs2; unsigned long gps; unsigned long j; unsigned long jn; @@ -1488,28 +1489,28 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Lots of memory barriers to reject false positives. * - * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, - * then rsp->gp_start, and finally rsp->completed. These values - * are updated in the opposite order with memory barriers (or - * equivalent) during grace-period initialization and cleanup. - * Now, a false positive can occur if we get an new value of - * rsp->gp_start and a old value of rsp->jiffies_stall. But given - * the memory barriers, the only way that this can happen is if one - * grace period ends and another starts between these two fetches. - * Detect this by comparing rsp->completed with the previous fetch - * from rsp->gpnum. + * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, + * then rsp->gp_start, and finally another copy of rsp->gp_seq. + * These values are updated in the opposite order with memory + * barriers (or equivalent) during grace-period initialization + * and cleanup. Now, a false positive can occur if we get an new + * value of rsp->gp_start and a old value of rsp->jiffies_stall. + * But given the memory barriers, the only way that this can happen + * is if one grace period ends and another starts between these + * two fetches. This is detected by comparing the second fetch + * of rsp->gp_seq with the previous fetch from rsp->gp_seq. * * Given this check, comparisons of jiffies, rsp->jiffies_stall, * and rsp->gp_start suffice to forestall false positives. */ - gpnum = READ_ONCE(rsp->gpnum); - smp_rmb(); /* Pick up ->gpnum first... */ + gs1 = READ_ONCE(rsp->gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ js = READ_ONCE(rsp->jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ gps = READ_ONCE(rsp->gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->completed. 
*/ - completed = READ_ONCE(rsp->completed); - if (ULONG_CMP_GE(completed, gpnum) || + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rsp->gp_seq); + if (gs1 != gs2 || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ @@ -1527,7 +1528,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) cmpxchg(&rsp->jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp, gpnum); + print_other_cpu_stall(rsp, gs2); } } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2036dc7426ac..f4a88e3c388d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1755,12 +1755,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) */ touch_nmi_watchdog(); - if (rsp->gpnum == rdp->gpnum) { + ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; - } else { - ticks_title = "GPs behind"; - ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); -- cgit v1.2.3 From aebc82644b2c8eafa15e8c481fbafc1b41f4fbf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 06:42:51 -0700 Subject: rcutorture: Convert rcutorture_get_gp_data() to ->gp_seq SRCU has long used ->srcu_gp_seq, and now RCU uses ->gp_seq. This commit therefore moves the rcutorture_get_gp_data() function from a ->gpnum / ->completed pair to ->gp_seq. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 17 ++++++----------- kernel/rcu/rcutorture.c | 24 ++++++++++-------------- kernel/rcu/srcutree.c | 5 ++--- kernel/rcu/tree.c | 5 ++--- 4 files changed, 20 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7568a3fd0815..003671825d62 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -425,7 +425,7 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned long *gp_seq); void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, @@ -435,13 +435,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, - unsigned long *gpnum, - unsigned long *completed) + int *flags, unsigned long *gp_seq) { *flags = 0; - *gpnum = 0; - *completed = 0; + *gp_seq = 0; } static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } @@ -461,21 +458,19 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = sp->srcu_idx; - *gpnum = *completed; + *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed); + unsigned 
long *gp_seq); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1f66597c7783..81fb43530d64 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1313,18 +1313,16 @@ rcu_torture_stats_print(void) if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags = 0; - unsigned long __maybe_unused gpnum = 0; - unsigned long __maybe_unused completed = 0; + unsigned long __maybe_unused gp_seq = 0; rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gpnum, &completed); + &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); + &flags, &gp_seq); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", + pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - gpnum, completed, flags, + rcu_torture_writer_state, gp_seq, flags, wtp == NULL ? ~0UL : wtp->state, wtp == NULL ? -1 : (int)task_cpu(wtp)); if (!splatted && wtp) { @@ -1605,8 +1603,7 @@ static void rcu_torture_cleanup(void) { int flags = 0; - unsigned long gpnum = 0; - unsigned long completed = 0; + unsigned long gp_seq = 0; int i; rcutorture_record_test_transition(); @@ -1637,11 +1634,10 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gpnum, &completed); - pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", - cur_ops->name, gpnum, completed, flags); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + pr_alert("%s: End-test grace-period state: g%lu f%#x\n", + cur_ops->name, gp_seq, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5a1a9a07b407..d6d6ea9738c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1249,13 +1249,12 @@ static void process_srcu(struct work_struct *work) void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = rcu_seq_ctr(sp->srcu_gp_seq); - *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); + *gp_seq = rcu_seq_current(&sp->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9e619c4878d3..4a528a062cd4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -638,7 +638,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed) + unsigned long *gp_seq) { struct rcu_state *rsp = NULL; @@ -658,8 +658,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, if (rsp == NULL) return; *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + *gp_seq = rcu_seq_current(&rsp->gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -- cgit v1.2.3 From 7a1d0f23ad70cd4813bf4b72735ea2c26a4f53fe Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 May 2018 10:26:57 -0700 Subject: rcu: Move from ->need_future_gp[] to ->gp_seq_needed One problem with the ->need_future_gp[] array is that the grace-period assignment of each element changes as the grace periods complete. This means that it is necessary to hold a lock when checking this array to learn if a given grace period has already been requested. This increase lock contention, which is the opposite of helpful. This commit therefore replaces the ->need_future_gp[] with a single ->gp_seq_needed value and keeps it updated in the rcu_data structure. This will enable reliable lockless checking of whether or not a given grace period has already been requested. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 34 ++++++++++++++++++---------------- kernel/rcu/tree.h | 19 ++----------------- 2 files changed, 20 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a528a062cd4..1ede51690e4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1560,7 +1560,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp[] field. Returns true if there + * rcu_node structure's ->gp_seq_needed field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which @@ -1589,14 +1589,14 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - if (need_future_gp_element(rnp_root, c) || + if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || rcu_seq_done(&rnp_root->gp_seq, c) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); goto unlock_out; } - need_future_gp_element(rnp_root, c) = true; + rnp_root->gp_seq_needed = c; if (rnp_root != rnp && rnp_root->parent != NULL) raw_spin_unlock_rcu_node(rnp_root); if (!rnp_root->parent) @@ -1633,8 +1633,9 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - need_future_gp_element(rnp, c) = false; - needmore = need_any_future_gp(rnp); + needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); + if (!needmore) + rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ trace_rcu_this_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; @@ -2046,7 +2047,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (need_any_future_gp(rnp)) { + if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, TPS("CleanupMore")); needgp = true; @@ -2700,8 +2701,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_node *rnp_root = rcu_get_root(rsp); static atomic_t warned = ATOMIC_INIT(0); - if (!IS_ENABLED(CONFIG_PROVE_RCU) || - rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp))) + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. 
*/ if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || @@ -2711,7 +2712,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; - if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || atomic_read(&warned)) { @@ -2723,7 +2725,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; - if (rcu_gp_in_progress(rsp) || !need_any_future_gp(rcu_get_root(rsp)) || + if (rcu_gp_in_progress(rsp) || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || time_before(j, rsp->gp_req_activity + HZ) || time_before(j, rsp->gp_activity + HZ) || atomic_xchg(&warned, 1)) { @@ -2731,12 +2734,9 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%lu %d%d%d%d gar:%lu ga:%lu f%#x %s->state:%#lx\n", - __func__, READ_ONCE(rsp->gpnum), - need_future_gp_element(rcu_get_root(rsp), 0), - need_future_gp_element(rcu_get_root(rsp), 1), - need_future_gp_element(rcu_get_root(rsp), 2), - need_future_gp_element(rcu_get_root(rsp), 3), + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", + __func__, (long)READ_ONCE(rsp->gp_seq), + (long)READ_ONCE(rnp_root->gp_seq_needed), j - rsp->gp_req_activity, j - rsp->gp_activity, rsp->gp_flags, rsp->name, rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); @@ -3527,6 +3527,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->gp_seq = rnp->gp_seq; + rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; @@ -3907,6 +3908,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; + rnp->gp_seq_needed = rsp->gp_seq; rnp->completedqs = rsp->gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a21d403a6010..9329c1ff695f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -88,6 +88,7 @@ struct rcu_node { /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ @@ -160,7 +161,6 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; @@ -170,22 +170,6 @@ struct rcu_node { bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; -/* Accessors for ->need_future_gp[] array. 
*/ -#define need_future_gp_mask() \ - (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) -#define need_future_gp_element(rnp, c) \ - ((rnp)->need_future_gp[(c >> RCU_SEQ_CTR_SHIFT) & need_future_gp_mask()]) -#define need_any_future_gp(rnp) \ -({ \ - int __i; \ - bool __nonzero = false; \ - \ - for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ - __nonzero = __nonzero || \ - READ_ONCE((rnp)->need_future_gp[__i]); \ - __nonzero; \ -}) - /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -213,6 +197,7 @@ struct rcu_data { unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ -- cgit v1.2.3 From ab5e869c1f7aa30a1210f5e8a277758b0599609f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 11:07:23 -0700 Subject: rcu: Make rcu_nocb_wait_gp() check if GP already requested This commit makes rcu_nocb_wait_gp() check rdp->gp_seq_needed to see if the current CPU already knows about the needed grace period having already been requested. If so, it avoids acquiring the corresponding leaf rcu_node structure's ->lock, thus decreasing contention. This optimization is intended for cases where either multiple leader rcuo kthreads are running on the same CPU or these kthreads are running on a non-offloaded (e.g., housekeeping) CPU. Signed-off-by: Paul E. McKenney [ paulmck: Move lock release past "if" as suggested by Joel Fernandes. ] [ paulmck: Fix caching of furthest-future requested grace period. ] --- kernel/rcu/tree.c | 5 +++++ kernel/rcu/tree_plugin.h | 15 ++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1ede51690e4a..4826598867c3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1618,6 +1618,11 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: + /* Push furthest requested GP to leaf node and rcu_data structure. */ + if (ULONG_CMP_LT(c, rnp_root->gp_seq_needed)) { + rnp->gp_seq_needed = rnp_root->gp_seq_needed; + rdp->gp_seq_needed = rnp_root->gp_seq_needed; + } if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); return ret; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f4a88e3c388d..ca73931f7b30 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2104,12 +2104,17 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + local_irq_save(flags); c = rcu_seq_snap(&rdp->rsp->gp_seq); - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rdp->rsp); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + local_irq_restore(flags); + } else { + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. 
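
The fast path in rcu_nocb_wait_gp() above consults the rcu_data structure's cached ->gp_seq_needed and takes the rcu_node lock only when the cache says the request is new. A stand-alone sketch of that check-the-cache-first shape (the structure, lock stub and numbers are illustrative stand-ins rather than kernel code):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))

struct cpu_cache {
        unsigned long gp_seq_needed;  /* furthest grace period this CPU has requested */
        bool gpwrap;                  /* counter may have wrapped, so distrust the cache */
};

static void request_gp_locked(unsigned long snap)
{
        /* Slow path: take the rcu_node lock, record the request, maybe wake the GP kthread. */
        printf("slow path: locked request for %lu\n", snap);
}

static void wait_for_gp(struct cpu_cache *cc, unsigned long snap)
{
        if (!cc->gpwrap && ULONG_CMP_GE(cc->gp_seq_needed, snap)) {
                /* Fast path: this grace period was already requested, no lock needed. */
                printf("fast path: %lu already requested\n", snap);
                return;
        }
        request_gp_locked(snap);
        cc->gp_seq_needed = snap;     /* remember it, as rdp->gp_seq_needed does */
}

int main(void)
{
        struct cpu_cache cc = { .gp_seq_needed = 100, .gpwrap = false };

        wait_for_gp(&cc, 96);   /* covered by an earlier request: stays lockless */
        wait_for_gp(&cc, 104);  /* genuinely new: falls back to the locked path */
        return 0;
}
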
*/ + needwake = rcu_start_this_gp(rnp, rdp, c); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); + } /* * Wait for the grace period. Do so interruptibly to avoid messing -- cgit v1.2.3 From 477351f7829d2268769c5d545511081555066529 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 12:54:11 -0700 Subject: rcu: Convert rcu_grace_period tracepoint to gp_seq This commit makes the rcu_grace_period tracepoint use gp_seq instead of ->gpnum or ->completed. It also introduces a "cpuofl-bgp" string to less obscurely indicate when a CPU has gone offline while a grace period is waiting on it. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 43 +++++++++++++++++++++---------------------- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4826598867c3..7ce85ad39af6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -234,7 +234,7 @@ void rcu_sched_qs(void) if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), + __this_cpu_read(rcu_sched_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) @@ -249,7 +249,7 @@ void rcu_bh_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), - __this_cpu_read(rcu_bh_data.gpnum), + __this_cpu_read(rcu_bh_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } @@ -1615,7 +1615,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1702,9 +1702,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); return ret; } @@ -1774,7 +1774,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * go looking for one. */ rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); @@ -1851,7 +1851,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_seq_start(&rsp->gp_seq); if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. 
*/ pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); - trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1928,7 +1928,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); @@ -2048,7 +2048,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); rcu_seq_end(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); + trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); @@ -2061,7 +2061,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); } else { WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); @@ -2087,7 +2087,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & @@ -2100,7 +2100,7 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("reqwaitsig")); } @@ -2119,7 +2119,7 @@ static int __noreturn rcu_gp_kthread(void *arg) jiffies + 3 * j); } trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout(rsp->gp_wq, @@ -2134,12 +2134,12 @@ static int __noreturn rcu_gp_kthread(void *arg) if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsstart")); rcu_gp_fqs(rsp, first_gp_fqs); first_gp_fqs = false; trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); @@ -2158,7 +2158,7 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + READ_ONCE(rsp->gp_seq), TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. 
*/ j = jiffies; @@ -2388,17 +2388,16 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask;) + RCU_TRACE(bool blkd;) RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask;) - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - !!(rnp->qsmask & mask), - TPS("cpuofl")); + RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + trace_rcu_grace_period(rsp->name, rnp->gp_seq, + blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); } /* @@ -3538,7 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ca73931f7b30..aca9d187c509 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -301,7 +301,7 @@ static void rcu_preempt_qs(void) RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_data_p->gpnum), + __this_cpu_read(rcu_data_p->gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ -- cgit v1.2.3 From abd13fdd9516e5baae2257721b921684ecb090d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:08:46 -0700 Subject: rcu: Convert rcu_future_grace_period tracepoint to gp_seq This commit makes the rcu_future_grace_period tracepoint use gp_seq instead of ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7ce85ad39af6..066dbaacec30 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1552,9 +1552,8 @@ void rcu_cpu_stall_reset(void) static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long c, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c, + rnp->level, rnp->grplo, rnp->grphi, s); } /* @@ -2053,7 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { - trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; } -- cgit v1.2.3 From 598ce09480efb6b48799df60c66bac70bea5ef54 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_preempt_task tracepoint to ->gp_seq This commit makes the rcu_preempt_task tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index aca9d187c509..02ca3b4e6a8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -350,8 +350,8 @@ static void rcu_preempt_note_context_switch(bool preempt) trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) - ? rnp->gpnum - : rnp->gpnum + 1); + ? rnp->gp_seq + : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { -- cgit v1.2.3 From 865aa1e08d8aefdfd1f5d30ecfce1b8ef8cd520a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_unlock_preempted_task tracepoint to ->gp_seq This commit makes the rcu_unlock_preempted_task tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ca3b4e6a8f..a10b0e26ce19 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -545,7 +545,7 @@ void rcu_read_unlock_special(struct task_struct *t) list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -708,7 +708,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), - rnp->gpnum, t->pid); + rnp->gp_seq, t->pid); } WARN_ON_ONCE(rnp->qsmask); } -- cgit v1.2.3 From db023296f0115d2fe01fdabad54678f2b806da23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_quiescent_state_report tracepoint to ->gp_seq This commit makes the rcu_quiescent_state_report tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 066dbaacec30..7c6c11d5479c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2229,7 +2229,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, !!rnp->gp_tasks); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a10b0e26ce19..c8a2c7760121 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -566,7 +566,7 @@ void rcu_read_unlock_special(struct task_struct *t) empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), - rnp->gpnum, + rnp->gp_seq, 0, rnp->qsmask, rnp->level, rnp->grplo, -- cgit v1.2.3 From fee5997c17562e95fb1fecc142efb2da0934baa4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 May 2018 13:35:20 -0700 Subject: rcu: Convert rcu_fqs tracepoint to ->gp_seq This commit makes the rcu_fqs tracepoint use ->gp_seq instead of ->gpnum. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c6c11d5479c..42e89d4f1e33 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1111,7 +1111,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } @@ -1161,7 +1161,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; @@ -1178,7 +1178,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { @@ -1188,7 +1188,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) /* Check for the CPU being offline. */ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); + trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; rcu_gpnum_ovf(rnp, rdp); return 1; -- cgit v1.2.3 From ff3bb6f4d06247508489345ee90a8a9b6f3ffd3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 14:34:08 -0700 Subject: rcu: Remove ->gpnum and ->completed Now that everything has been converted to use ->gp_seq instead of ->gpnum and ->completed, this commit removes ->gpnum and ->completed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 +++++++++++---------------------------------- kernel/rcu/tree.h | 14 +------------- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 13 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 42e89d4f1e33..0aeddc908181 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -95,8 +95,6 @@ struct rcu_state sname##_state = { \ .rda = &sname##_data, \ .call = cr, \ .gp_state = RCU_GP_IDLE, \ - .gpnum = 0UL - 300UL, \ - .completed = 0UL - 300UL, \ .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ @@ -1349,8 +1347,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; - WARN_ON_ONCE(gp_seq & 0x2); /* Remove when ->gpnum removed. */ - /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(rsp); if (rcu_cpu_stall_suppress) @@ -1582,8 +1578,6 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. 
*/ raw_lockdep_assert_held_rcu_node(rnp); - WARN_ON_ONCE(c & 0x2); /* Catch any lingering use of ->gpnum. */ - WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq)); /* Catch any ->completed/->gp_seq mismatches. */ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) @@ -1757,8 +1751,6 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ - /* Remember that we saw this grace-period completion. */ - rdp->completed = rnp->completed; trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); } else { ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ @@ -1772,7 +1764,6 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; @@ -1843,13 +1834,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - /* Record GP times before starting GP, hence smp_store_release(). */ - WARN_ON_ONCE(rsp->gpnum << RCU_SEQ_CTR_SHIFT != rsp->gp_seq); - smp_store_release(&rsp->gpnum, rsp->gpnum + 1); - smp_mb(); /* Pairs with barriers in stall-warning code. */ + /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rsp->gp_seq); - if (WARN_ON_ONCE(((rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT) != rcu_seq_ctr(rnp->gp_seq))) /* Catch any ->completed/->gp_seq mismatches. */ - pr_info("%s ->completed: %#lx (%#lx) ->gp_seq %#lx (%#lx)\n", __func__, rnp->completed, (rnp->completed << RCU_SEQ_CTR_SHIFT) >> RCU_SEQ_CTR_SHIFT, rnp->gp_seq, rcu_seq_ctr(rnp->gp_seq)); trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1920,9 +1906,6 @@ static bool rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - WRITE_ONCE(rnp->gpnum, rsp->gpnum); - if (WARN_ON_ONCE(rnp->completed != rsp->completed)) - WRITE_ONCE(rnp->completed, rsp->completed); WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); @@ -2012,13 +1995,13 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); /* - * Propagate new ->completed value to rcu_node structures so - * that other CPUs don't have to wait until the start of the next - * grace period to process their callbacks. This also avoids - * some nasty RCU grace-period initialization races by forcing - * the end of the current grace period to be completely recorded in - * all of the rcu_node structures before the beginning of the next - * grace period is recorded in any of the rcu_node structures. + * Propagate new ->gp_seq value to rcu_node structures so that + * other CPUs don't have to wait until the start of the next grace + * period to process their callbacks. 
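
With ->gpnum and ->completed gone, starting and ending a grace period reduces to rcu_seq_start() and rcu_seq_end() on the single ->gp_seq counter, whose low two bits hold the in-progress state. The stand-alone sketch below is meant to mirror the kernel/rcu/rcu.h helpers minus the WRITE_ONCE()/WARN_ON_ONCE() and barrier details; the initial value is the boot-time bias from the rcu_state initializer earlier in this patch:

#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT       2
#define RCU_SEQ_STATE_MASK      ((1UL << RCU_SEQ_CTR_SHIFT) - 1)

static unsigned long rcu_seq_state(unsigned long s)
{
        return s & RCU_SEQ_STATE_MASK;          /* non-zero: grace period in progress */
}

static void rcu_seq_start(unsigned long *sp)
{
        *sp += 1;                               /* low bits become non-zero: GP has begun */
}

static void rcu_seq_end(unsigned long *sp)
{
        *sp = (*sp | RCU_SEQ_STATE_MASK) + 1;   /* next multiple of four: GP idle again */
}

int main(void)
{
        unsigned long gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT;

        printf("idle:    state=%lu\n", rcu_seq_state(gp_seq));
        rcu_seq_start(&gp_seq);
        printf("started: state=%lu\n", rcu_seq_state(gp_seq));
        rcu_seq_end(&gp_seq);
        printf("ended:   state=%lu\n", rcu_seq_state(gp_seq));
        return 0;
}
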
This also avoids some nasty + * RCU grace-period initialization races by forcing the end of + * the current grace period to be completely recorded in all of + * the rcu_node structures before the beginning of the next grace + * period is recorded in any of the rcu_node structures. */ new_gp_seq = rsp->gp_seq; rcu_seq_end(&new_gp_seq); @@ -2027,7 +2010,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); - WRITE_ONCE(rnp->completed, rsp->gpnum); WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) @@ -2045,7 +2027,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ /* Declare grace period done. */ - WRITE_ONCE(rsp->completed, rsp->gpnum); rcu_seq_end(&rsp->gp_seq); trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); rsp->gp_state = RCU_GP_IDLE; @@ -3496,9 +3477,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* * Initialize a CPU's per-CPU RCU data. Note that only one online or - * offline event can be happening at a given time. Note also that we - * can accept some slop in the rsp->completed access due to the fact - * that this CPU cannot possibly have any RCU callbacks in flight yet. + * offline event can be happening at a given time. Note also that we can + * accept some slop in the rsp->gp_seq access due to the fact that this + * CPU cannot possibly have any RCU callbacks in flight yet. */ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) @@ -3527,8 +3508,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rdp->beenonline = true; /* We have now been online. */ - rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ - rdp->completed = rnp->completed; rdp->gp_seq = rnp->gp_seq; rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -3908,8 +3887,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; rnp->gp_seq = rsp->gp_seq; rnp->gp_seq_needed = rsp->gp_seq; rnp->completedqs = rsp->gp_seq; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9329c1ff695f..3def94fc9c74 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -81,12 +81,6 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gpnum; /* Current grace period for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ - unsigned long completed; /* Last GP completed for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed. */ unsigned long completedqs; /* All QSes done for this node. */ @@ -192,10 +186,6 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long completed; /* Track rsp->completed gp number */ - /* in order to detect GP end. */ - unsigned long gpnum; /* Highest gp number that this CPU */ - /* is aware of having started. */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. 
*/ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ @@ -203,7 +193,7 @@ struct rcu_data { union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool gpwrap; /* Possible gpnum/completed wrap. */ + bool gpwrap; /* Possible ->gp_seq wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -328,8 +318,6 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ - unsigned long gpnum; /* Current gp number. */ - unsigned long completed; /* # of last completed gp. */ unsigned long gp_seq; /* Grace-period sequence #. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8a2c7760121..3a6e04103de1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -690,7 +690,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock * must be held by the caller. * * Also, if there are blocked tasks on the list, they automatically -- cgit v1.2.3 From e44e73ca47b47510dac491329d453d82aea1d8d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 16:29:47 -0700 Subject: rcu: Make simple callback acceleration refer to rdp->gp_seq_needed Now that the rcu_data structure contains ->gp_seq_needed, create an rcu_accelerate_cbs_unlocked() helper function that locklessly checks to see if new callbacks' required grace period has already been requested. If so, update the callback list locally and again locklessly. (Though interrupts must be and are disabled to avoid racing with conflicting updates in interrupt handlers.) Otherwise, call rcu_accelerate_cbs() as before. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0aeddc908181..5643c135fb06 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1701,6 +1701,34 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, return ret; } +/* + * Similar to rcu_accelerate_cbs(), but does not require that the leaf + * rcu_node structure's ->lock be held. It consults the cached value + * of ->gp_seq_needed in the rcu_data structure, and if that indicates + * that a new grace-period request be made, invokes rcu_accelerate_cbs() + * while holding the leaf rcu_node structure's ->lock. + */ +static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + bool needwake; + + lockdep_assert_irqs_disabled(); + c = rcu_seq_snap(&rsp->gp_seq); + if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + /* Old request still live, so mark recent callbacks. 
*/ + (void)rcu_segcblist_accelerate(&rdp->cblist, c); + return; + } + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); +} + /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and @@ -2739,7 +2767,6 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; - bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); struct rcu_node *rnp = rdp->mynode; @@ -2752,15 +2779,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (!rcu_gp_in_progress(rsp) && rcu_segcblist_is_enabled(&rdp->cblist)) { local_irq_save(flags); - if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { - local_irq_restore(flags); - } else { - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) + rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); + local_irq_restore(flags); } rcu_check_gp_start_stall(rsp, rnp, rdp); @@ -2818,8 +2839,6 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { - bool needwake; - /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2846,13 +2865,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp = rdp->mynode; - - raw_spin_lock_rcu_node(rnp); - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock_rcu_node(rnp); - if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; -- cgit v1.2.3 From a2165e416878b325747f871df4b236b49bf61486 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 May 2018 07:42:20 -0700 Subject: rcu: Don't funnel-lock above leaf node if GP in progress The old grace-period start code would acquire only the leaf's rcu_node structure's ->lock if that structure believed that a grace period was in progress. The new code advances to the leaf's parent in this case, needlessly acquiring then leaf's parent's ->lock. This commit therefore checks the grace-period state after marking the leaf with the need for the specified grace period, and if the leaf believes that a grace period is in progress, takes an early exit. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney [ paulmck: Add "Startedleaf" tracing as suggested by Joel Fernandes. ] --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5643c135fb06..24a79e85b81f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1590,6 +1590,15 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, goto unlock_out; } rnp_root->gp_seq_needed = c; + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + /* + * We just marked the leaf, and a grace period + * is in progress, which means that rcu_gp_cleanup() + * will see the marking. Bail to reduce contention. 
+ */ + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); + goto unlock_out; + } if (rnp_root != rnp && rnp_root->parent != NULL) raw_spin_unlock_rcu_node(rnp_root); if (!rnp_root->parent) -- cgit v1.2.3 From 5b55072f22ba2ed136b7a1b6c5beea9ace8415a7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 13 May 2018 20:15:40 -0700 Subject: rcu: Produce last "CleanupMore" trace only if late-breaking request Currently Tree RCU's clean-up code emits a "CleanupMore" trace event in response to late-arriving grace-period requests even if the grace period was already requested. This makes "CleanupMore" show up an extra time (in addition to once for each rcu_node structure that was previously marked with the request), and for no good reason. This commit therefore avoids emitting this trace message unless the the only request for this next grace period arrived during or after the cleanup scan of the rcu_node structures. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 24a79e85b81f..73a33b82cfcd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2069,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); - if (ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { + if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; -- cgit v1.2.3 From 5ca0905f6787e930bc2a626cf1d8f69fab52acef Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 13 May 2018 20:15:41 -0700 Subject: rcu: Fix cpustart tracepoint gp_seq number The "cpustart" trace event shows a stale gp_seq. This is because it uses rdp->gp_seq, which is updated only at the end of the __note_gp_changes() function. This commit therefore instead uses rnp->gp_seq. An alternative fix would be to update rdp->gp_seq earlier, but this would break RCU's detection of the beginning of a new-to-this-CPU grace period. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73a33b82cfcd..973250503d98 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1801,7 +1801,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpustart")); + trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); -- cgit v1.2.3 From 2e3e5e55010105f9d4351f68e15dbc43402a7794 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 11:53:41 -0700 Subject: rcu: Make rcu_start_this_gp() check for grace period already started In the old days of ->gpnum and ->completed, the code requesting a new grace period checked to see if that grace period had already started, bailing early if so. The new-age ->gp_seq approach instead checks whether the grace period has already finished. 
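
To make the started/finished distinction concrete, the user-space walk-through below exercises the relevant helpers: rcu_seq_started() is the new function added in the hunk that follows, while the rcu_seq_snap() and rcu_seq_done() bodies are intended to mirror kernel/rcu/rcu.h with READ_ONCE() and the memory barriers omitted; the starting value is arbitrary.

#include <limits.h>
#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT       2
#define RCU_SEQ_STATE_MASK      ((1UL << RCU_SEQ_CTR_SHIFT) - 1)
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

/* Sequence value at which a full grace period beginning after "now" has ended. */
static unsigned long rcu_seq_snap(unsigned long *sp)
{
        return (*sp + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
}

/* Has the grace period covering snapshot s started? */
static int rcu_seq_started(unsigned long *sp, unsigned long s)
{
        return ULONG_CMP_LT((s - 1) & ~RCU_SEQ_STATE_MASK, *sp);
}

/* Has the grace period covering snapshot s completed? */
static int rcu_seq_done(unsigned long *sp, unsigned long s)
{
        return ULONG_CMP_GE(*sp, s);
}

int main(void)
{
        unsigned long gp_seq = 400;                     /* idle: low state bits are zero */
        unsigned long s = rcu_seq_snap(&gp_seq);        /* 404: one full GP from now */

        printf("idle   : started=%d done=%d\n",
               rcu_seq_started(&gp_seq, s), rcu_seq_done(&gp_seq, s));
        gp_seq += 1;                                    /* rcu_seq_start(): a GP is in progress */
        printf("running: started=%d done=%d\n",
               rcu_seq_started(&gp_seq, s), rcu_seq_done(&gp_seq, s));
        gp_seq = (gp_seq | RCU_SEQ_STATE_MASK) + 1;     /* rcu_seq_end(): gp_seq is now 404 */
        printf("ended  : started=%d done=%d\n",
               rcu_seq_started(&gp_seq, s), rcu_seq_done(&gp_seq, s));
        return 0;
}
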
A compensating change pushed the requested grace period down to the bottom of the tree, thus reducing lock contention and even eliminating it in some cases. But why not further reduce contention, especially on large systems, by doing both, especially given that the cost of doing both is extremely small? This commit therefore adds a new rcu_seq_started() function that checks whether a specified grace period has already started. It then uses this new function in place of rcu_seq_done() in the rcu_start_this_gp() function's funnel locking code. Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 9 +++++++++ kernel/rcu/tree.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 003671825d62..1c5cbd9d7c97 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -107,6 +107,15 @@ static inline unsigned long rcu_seq_current(unsigned long *sp) return READ_ONCE(*sp); } +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not the + * corresponding update-side operation has started. + */ +static inline bool rcu_seq_started(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_LT((s - 1) & ~RCU_SEQ_STATE_MASK, READ_ONCE(*sp)); +} + /* * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 973250503d98..cbf2bcde5e60 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1583,7 +1583,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || - rcu_seq_done(&rnp_root->gp_seq, c) || + rcu_seq_started(&rnp_root->gp_seq, c) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); -- cgit v1.2.3 From d72193123c81ae6123d108b3be2096f3f13b25a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 15:24:41 -0700 Subject: rcutorture: Correctly handle grace-period sequence wrap The new ->gq_seq grace-period sequence numbers must be shifted down, which give artifacts when these numbers wrap. This commit therefore enables rcutorture and rcuperf to handle grace-period sequence numbers even if they do wrap. It does this by allowing a special subtraction function to be specified, and this function subtracts before shifting. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 9 +++++++++ kernel/rcu/rcuperf.c | 18 ++++++++++++++++-- kernel/rcu/rcutorture.c | 19 +++++++++++++------ kernel/rcu/tree.c | 6 +++--- 4 files changed, 41 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1c5cbd9d7c97..aa215d6355f8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -142,6 +142,15 @@ static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) new); } +/* + * Roughly how many full grace periods have elapsed between the collection + * of the two specified grace periods? 
+ */ +static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) +{ + return (new - old) >> RCU_SEQ_CTR_SHIFT; +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 2b5a613afcf3..b080bc4a4f45 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -139,6 +139,7 @@ struct rcu_perf_ops { int (*readlock)(void); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); unsigned long (*exp_completed)(void); void (*async)(struct rcu_head *head, rcu_callback_t func); void (*gp_barrier)(void); @@ -179,6 +180,7 @@ static struct rcu_perf_ops rcu_ops = { .readlock = rcu_perf_read_lock, .readunlock = rcu_perf_read_unlock, .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, .async = call_rcu, .gp_barrier = rcu_barrier, @@ -208,6 +210,7 @@ static struct rcu_perf_ops rcu_bh_ops = { .readlock = rcu_bh_perf_read_lock, .readunlock = rcu_bh_perf_read_unlock, .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_bh, .gp_barrier = rcu_barrier_bh, @@ -264,6 +267,7 @@ static struct rcu_perf_ops srcu_ops = { .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -292,6 +296,7 @@ static struct rcu_perf_ops srcud_ops = { .readlock = srcu_perf_read_lock, .readunlock = srcu_perf_read_unlock, .get_gp_seq = srcu_perf_completed, + .gp_diff = rcu_seq_diff, .exp_completed = srcu_perf_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, @@ -321,6 +326,7 @@ static struct rcu_perf_ops sched_ops = { .readlock = sched_perf_read_lock, .readunlock = sched_perf_read_unlock, .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed_sched, .async = call_rcu_sched, .gp_barrier = rcu_barrier_sched, @@ -348,6 +354,7 @@ static struct rcu_perf_ops tasks_ops = { .readlock = tasks_perf_read_lock, .readunlock = tasks_perf_read_unlock, .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, @@ -355,6 +362,13 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -577,8 +591,8 @@ rcu_perf_cleanup(void) t_rcu_perf_writer_finished - t_rcu_perf_writer_started, ngps, - b_rcu_perf_writer_finished - - b_rcu_perf_writer_started); + rcuperf_seq_diff(b_rcu_perf_writer_finished, + b_rcu_perf_writer_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 81fb43530d64..0481c7286875 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -265,6 +265,7 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct 
rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -400,6 +401,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -441,6 +443,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .get_gp_seq = rcu_bh_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -646,6 +649,7 @@ static struct rcu_torture_ops sched_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .get_gp_seq = rcu_sched_get_gp_seq, + .gp_diff = rcu_seq_diff, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -695,6 +699,13 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; @@ -1127,9 +1138,7 @@ static void rcu_torture_timer(struct timer_list *unused) rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (completed > ULONG_MAX >> 1) - completed = 0; /* Not all gp_seq have full range. */ + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1205,9 +1214,7 @@ rcu_torture_reader(void *arg) rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed - started; - if (completed > ULONG_MAX >> 1) - completed = 0; /* Not all gp_seq have full range. */ + completed = rcutorture_seq_diff(completed, started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cbf2bcde5e60..fa219eea0ae7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -532,7 +532,7 @@ static int rcu_pending(void); */ unsigned long rcu_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_state_p->gp_seq)); + return READ_ONCE(rcu_state_p->gp_seq); } EXPORT_SYMBOL_GPL(rcu_get_gp_seq); @@ -541,7 +541,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_seq); */ unsigned long rcu_sched_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_sched_state.gp_seq)); + return READ_ONCE(rcu_sched_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); @@ -550,7 +550,7 @@ EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); */ unsigned long rcu_bh_get_gp_seq(void) { - return rcu_seq_ctr(READ_ONCE(rcu_bh_state.gp_seq)); + return READ_ONCE(rcu_bh_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); -- cgit v1.2.3 From 3d18469a2bb3988e669d67e097eff42dd40663d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 16:47:30 -0700 Subject: rcu: Regularize resetting of rcu_data wrap indicator The rcu_data structure's ->gpwrap indicator is currently reset only when the CPU in question detects a new grace period. 
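
The rcuperf and rcutorture changes above cope with a wrapped counter only because rcu_seq_diff() subtracts before shifting; shifting each value first would discard exactly the bits that make the unsigned subtraction wrap-safe. A short demonstration (the rcu_seq_diff() body is the one added in the preceding sequence-wrap patch; the sample values are chosen to straddle the wrap):

#include <limits.h>
#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT       2

/* Roughly how many full grace periods separate the two sequence values? */
static unsigned long rcu_seq_diff(unsigned long new, unsigned long old)
{
        return (new - old) >> RCU_SEQ_CTR_SHIFT;
}

int main(void)
{
        unsigned long started  = ULONG_MAX - 3; /* snapshot taken just before the wrap */
        unsigned long finished = started + 12;  /* three grace periods later, wrapped */

        printf("subtract, then shift: %lu GPs\n", rcu_seq_diff(finished, started));
        printf("shift, then subtract: %lu GPs\n",
               (finished >> RCU_SEQ_CTR_SHIFT) - (started >> RCU_SEQ_CTR_SHIFT));
        return 0;
}
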
This is in theory sufficient because any CPU that has been out of action for long enough that its ->gpwrap indicator is set is guaranteed to see both the end of an old grace period and the start of a new one. However, the current code leaves a short window during which the ->gpwrap indicator has been reset but the corresponding ->gp_seq counter has not yet been brought up to date. This is harmless because interrupts are disabled, but it is likely to (at the very least) cause confusion. This commit therefore moves the resetting of ->gpwrap to follow the updating of ->gp_seq. While in the area, it also resets ->gp_seq_needed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fa219eea0ae7..161e8fb8b83f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1807,10 +1807,12 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); - WRITE_ONCE(rdp->gpwrap, false); - rcu_gpnum_ovf(rnp, rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ + if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); return ret; } -- cgit v1.2.3 From b73de91d6a4c97ed586b2a5a6ce7c6fe395d9a3b Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 20 May 2018 21:42:18 -0700 Subject: rcu: Rename the grace-period-request variables and parameters The name 'c' is used for variables and parameters holding the requested grace-period sequence number. However it is no longer very meaningful given the conversions from ->gpnum and (especially) ->completed to ->gp_seq. This commit therefore renames 'c' to 'gp_seq_req'. Previous patch discussion is at: https://patchwork.kernel.org/patch/10396579/ Signed-off-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 161e8fb8b83f..8c31b1740afc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1546,13 +1546,18 @@ void rcu_cpu_stall_reset(void) /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) + unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, c, + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, rnp->level, rnp->grplo, rnp->grphi, s); } /* + * rcu_start_this_gp - Request the start of a particular grace period + * @rnp: The leaf node of the CPU from which to start. + * @rdp: The rcu_data corresponding to the CPU from which to start. + * @gp_seq_req: The gp_seq of the grace period to start. + * * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each * rcu_node structure's ->gp_seq_needed field. Returns true if there @@ -1560,9 +1565,11 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. 
+ * + * Returns true if the GP thread needs to be awakened else false. */ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c) + unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; @@ -1578,25 +1585,27 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * not be released. */ raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf")); for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); - if (ULONG_CMP_GE(rnp_root->gp_seq_needed, c) || - rcu_seq_started(&rnp_root->gp_seq, c) || + if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) || (rnp != rnp_root && rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, + TPS("Prestarted")); goto unlock_out; } - rnp_root->gp_seq_needed = c; + rnp_root->gp_seq_needed = gp_seq_req; if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf, and a grace period * is in progress, which means that rcu_gp_cleanup() * will see the marking. Bail to reduce contention. */ - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, + TPS("Startedleaf")); goto unlock_out; } if (rnp_root != rnp && rnp_root->parent != NULL) @@ -1607,21 +1616,21 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* If GP already in progress, just leave, otherwise start one. */ if (rcu_gp_in_progress(rsp)) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ - if (ULONG_CMP_LT(c, rnp_root->gp_seq_needed)) { + if (ULONG_CMP_LT(gp_seq_req, rnp_root->gp_seq_needed)) { rnp->gp_seq_needed = rnp_root->gp_seq_needed; rdp->gp_seq_needed = rnp_root->gp_seq_needed; } @@ -1636,14 +1645,13 @@ unlock_out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - unsigned long c = rnp->gp_seq; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); if (!needmore) rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ - trace_rcu_this_gp(rnp, rdp, c, + trace_rcu_this_gp(rnp, rdp, rnp->gp_seq, needmore ? 
TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1679,7 +1687,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; + unsigned long gp_seq_req; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1698,9 +1706,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - c = rcu_seq_snap(&rsp->gp_seq); - if (rcu_segcblist_accelerate(&rdp->cblist, c)) - ret = rcu_start_this_gp(rnp, rdp, c); + gp_seq_req = rcu_seq_snap(&rsp->gp_seq); + if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) + ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) -- cgit v1.2.3 From df2bf8f7f776cef57e6b27690c7b78c86f259515 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:14 -0700 Subject: rcu: Use better variable names in funnel locking loop The funnel locking loop in rcu_start_this_gp uses rcu_root as a temporary variable while walking the combining tree. This causes a tiresome exercise of a code reader reminding themselves that rcu_root may not be root. Lets just call it rnp, and rename other variables as well to be more appropriate. Original patch: https://patchwork.kernel.org/patch/10396577/ Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney [ paulmck: Fix name in comment as well. ] --- kernel/rcu/tree.c | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c31b1740afc..1af58f4b8a25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1554,7 +1554,7 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * rcu_start_this_gp - Request the start of a particular grace period - * @rnp: The leaf node of the CPU from which to start. + * @rnp_start: The leaf node of the CPU from which to start. * @rdp: The rcu_data corresponding to the CPU from which to start. * @gp_seq_req: The gp_seq of the grace period to start. * @@ -1568,74 +1568,74 @@ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, * * Returns true if the GP thread needs to be awakened else false. */ -static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, +static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unsigned long gp_seq_req) { bool ret = false; struct rcu_state *rsp = rdp->rsp; - struct rcu_node *rnp_root; + struct rcu_node *rnp; /* * Use funnel locking to either acquire the root rcu_node * structure's lock or bail out if the need for this grace period - * has already been recorded -- or has already started. If there - * is already a grace period in progress in a non-leaf node, no - * recording is needed because the end of the grace period will - * scan the leaf rcu_node structures. Note that rnp->lock must - * not be released. + * has already been recorded -- or if that grace period has in + * fact already started. If there is already a grace period in + * progress in a non-leaf node, no recording is needed because the + * end of the grace period will scan the leaf rcu_node structures. + * Note that rnp_start->lock must not be released. 
*/ - raw_lockdep_assert_held_rcu_node(rnp); - trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startleaf")); - for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); - if (ULONG_CMP_GE(rnp_root->gp_seq_needed, gp_seq_req) || - rcu_seq_started(&rnp_root->gp_seq, gp_seq_req) || - (rnp != rnp_root && - rcu_seq_state(rcu_seq_current(&rnp_root->gp_seq)))) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, + raw_lockdep_assert_held_rcu_node(rnp_start); + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf")); + for (rnp = rnp_start; 1; rnp = rnp->parent) { + if (rnp != rnp_start) + raw_spin_lock_rcu_node(rnp); + if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) || + rcu_seq_started(&rnp->gp_seq, gp_seq_req) || + (rnp != rnp_start && + rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) { + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Prestarted")); goto unlock_out; } - rnp_root->gp_seq_needed = gp_seq_req; - if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { + rnp->gp_seq_needed = gp_seq_req; + if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) { /* * We just marked the leaf, and a grace period * is in progress, which means that rcu_gp_cleanup() * will see the marking. Bail to reduce contention. */ - trace_rcu_this_gp(rnp, rdp, gp_seq_req, + trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startedleaf")); goto unlock_out; } - if (rnp_root != rnp && rnp_root->parent != NULL) - raw_spin_unlock_rcu_node(rnp_root); - if (!rnp_root->parent) + if (rnp != rnp_start && rnp->parent != NULL) + raw_spin_unlock_rcu_node(rnp); + if (!rnp->parent) break; /* At root, and perhaps also leaf. */ } /* If GP already in progress, just leave, otherwise start one. */ if (rcu_gp_in_progress(rsp)) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("Startedroot")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); rsp->gp_req_activity = jiffies; if (!rsp->gp_kthread) { - trace_rcu_this_gp(rnp_root, rdp, gp_seq_req, TPS("NoGPkthread")); + trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ - if (ULONG_CMP_LT(gp_seq_req, rnp_root->gp_seq_needed)) { - rnp->gp_seq_needed = rnp_root->gp_seq_needed; - rdp->gp_seq_needed = rnp_root->gp_seq_needed; + if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { + rnp_start->gp_seq_needed = rnp->gp_seq_needed; + rdp->gp_seq_needed = rnp->gp_seq_needed; } - if (rnp != rnp_root) - raw_spin_unlock_rcu_node(rnp_root); + if (rnp != rnp_start) + raw_spin_unlock_rcu_node(rnp); return ret; } -- cgit v1.2.3 From 226ca5e76692e2c82c17e8e8eedab22043f6ffee Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:15 -0700 Subject: rcu: Identify grace period is in progress as we advance up the tree There's no need to keep checking the same starting node for whether a grace period is in progress as we advance up the funnel lock loop. Its sufficient if we just checked it in the start, and then subsequently checked the internal nodes as we advanced up the combining tree. 
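Condensed into a runnable toy, the walk described here looks roughly as follows. The two-level tree, the struct and the helper name are invented for illustration; the real rcu_start_this_gp() additionally holds per-node locks, uses wrap-safe ULONG_CMP_GE() comparisons and rcu_seq_started(), and handles the root-level start-a-new-GP, tracing and wakeup paths that are elided below.

#include <stdbool.h>
#include <stdio.h>

#define SEQ_STATE_MASK 0x3UL    /* nonzero low-order gp_seq bits: GP in progress */

struct toy_node {
        struct toy_node *parent;
        unsigned long gp_seq;           /* grace-period number plus state bits */
        unsigned long gp_seq_needed;    /* furthest grace period requested here */
};

/* Walk leaf towards root, bailing as soon as some level already covers the request. */
static bool request_gp(struct toy_node *start, unsigned long gp_seq_req)
{
        struct toy_node *n;

        for (n = start; ; n = n->parent) {
                if (n->gp_seq_needed >= gp_seq_req)
                        return false;   /* someone already asked for this GP */
                n->gp_seq_needed = gp_seq_req;
                if (n->gp_seq & SEQ_STATE_MASK)
                        return false;   /* a GP is running at this level; it will see the mark */
                if (!n->parent)
                        return true;    /* reached the root: caller starts a new GP */
        }
}

int main(void)
{
        struct toy_node root = { .gp_seq = (7UL << 2) | 1 };            /* GP #7 initializing */
        struct toy_node leaf = { .parent = &root, .gp_seq = 6UL << 2 }; /* leaf not yet updated */

        /* The leaf alone shows no GP in progress; checking each level catches it at the root. */
        printf("must start a new GP? %s\n",
               request_gp(&leaf, 8UL << 2) ? "yes" : "no");             /* prints "no" */
        return 0;
}
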
This also makes sense because the grace-period updates propogate from the root to the leaf, so there's a chance we may find a grace period has started as we advance up, lets check for the same. Reported-by: Paul McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1af58f4b8a25..a6863b813f0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1598,11 +1598,12 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, goto unlock_out; } rnp->gp_seq_needed = gp_seq_req; - if (rcu_seq_state(rcu_seq_current(&rnp_start->gp_seq))) { + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* - * We just marked the leaf, and a grace period - * is in progress, which means that rcu_gp_cleanup() - * will see the marking. Bail to reduce contention. + * We just marked the leaf or internal node, and a + * grace period is in progress, which means that + * rcu_gp_cleanup() will see the marking. Bail to + * reduce contention. */ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startedleaf")); -- cgit v1.2.3 From 962aff03c315b508d980422db5b49b49e4382119 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 12:49:21 -0700 Subject: rcu: Clean up handling of tasks blocked across full-rcu_node offline Commit 0aa04b055e71 ("rcu: Process offlining and onlining only at grace-period start") deferred handling of CPU-hotplug events until the start of the next grace period, but consider the following sequence of events: 1. A task is preempted within an RCU-preempt read-side critical section. 2. The CPU that this task was running on goes offline, along with all other CPUs sharing the corresponding leaf rcu_node structure. 3. The task resumes execution. 4. One of those CPUs comes back online before a new grace period starts. In step 2, the code in the next rcu_gp_init() invocation will (correctly) defer removing the leaf rcu_node structure from the upper-level bitmasks, and will (correctly) set that structure's ->wait_blkd_tasks field. During the ensuing interval, RCU will (correctly) track the tasks preempted on that structure because they must block any subsequent grace period. In step 3, the code in rcu_read_unlock_special() will (correctly) remove the task from the leaf rcu_node structure. From this point forward, RCU need not pay attention to this structure, at least not until one of the corresponding CPUs comes back online. In step 4, the code in the next rcu_gp_init() invocation will (incorrectly) invoke rcu_init_new_rnp(). This is incorrect because the corresponding rcu_cleanup_dead_rnp() was never invoked. This is nevertheless harmless because the upper-level bits are still set. So, no harm, no foul, right? At least, all is well until a little further into rcu_gp_init() invocation, which will notice that there are no longer any tasks blocked on the leaf rcu_node structure, conclude that there is no longer anything left over from step 2's offline operation, and will therefore invoke rcu_cleanup_dead_rnp(). But this invocation of rcu_cleanup_dead_rnp() is for the beginning of the earlier offline interval, and the previous invocation of rcu_init_new_rnp() is for the end of that same interval. That is right, they are invoked out of order. That cannot be good, can it? It turns out that this is not a (correctness!) 
problem because rcu_cleanup_dead_rnp() checks to see if any of the corresponding CPUs are online, and refuses to do anything if so. In other words, in the case where rcu_init_new_rnp() and rcu_cleanup_dead_rnp() execute out of order, they both have no effect. But this is at best an accident waiting to happen. This commit therefore adds logic to rcu_gp_init() so that rcu_init_new_rnp() and rcu_cleanup_dead_rnp() are always invoked in order, and so that neither are invoked at all in cases where RCU had to pay attention to the leaf rcu_node structure during the entire time that all corresponding CPUs were offline. And, while in the area, this commit reduces confusion by using formal parameters rather than local variables that just happen to have the same value at that particular point in the code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6863b813f0c..9a5ba6db7b60 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1909,12 +1909,14 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ if (!oldmask != !rnp->qsmaskinit) { - if (!oldmask) /* First online CPU for this rcu_node. */ - rcu_init_new_rnp(rnp); - else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ - rnp->wait_blkd_tasks = true; - else /* Last offline CPU and can propagate. */ + if (!oldmask) { /* First online CPU for rcu_node. */ + if (!rnp->wait_blkd_tasks) /* Ever offline? */ + rcu_init_new_rnp(rnp); + } else if (rcu_preempt_has_tasks(rnp)) { + rnp->wait_blkd_tasks = true; /* blocked tasks */ + } else { /* Last offline CPU and can propagate. */ rcu_cleanup_dead_rnp(rnp); + } } /* @@ -1923,14 +1925,13 @@ static bool rcu_gp_init(struct rcu_state *rsp) * still offline, propagate up the rcu_node tree and * clear ->wait_blkd_tasks. Otherwise, if one of this * rcu_node structure's CPUs has since come back online, - * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() - * checks for this, so just call it unconditionally). + * simply clear ->wait_blkd_tasks. */ if (rnp->wait_blkd_tasks && - (!rcu_preempt_has_tasks(rnp) || - rnp->qsmaskinit)) { + (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) { rnp->wait_blkd_tasks = false; - rcu_cleanup_dead_rnp(rnp); + if (!rnp->qsmaskinit) + rcu_cleanup_dead_rnp(rnp); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2450,9 +2451,10 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) return; for (;;) { mask = rnp->grpmask; @@ -2461,7 +2463,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) break; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; - rnp->qsmask &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); if (rnp->qsmaskinit) { raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ @@ -3479,6 +3482,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) struct rcu_node *rnp = rnp_leaf; raw_lockdep_assert_held_rcu_node(rnp); + WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; -- cgit v1.2.3 From c50cbe535c972150c2caf923239ef77e85c5ad60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 13:51:57 -0700 Subject: rcu: Fix an obsolete ->qsmaskinit comment Back in the old days, when grace-period initialization blocked CPU hotplug, the ->qsmaskinit mask was indeed updated at the time that a given CPU went offline. However, with the deferral of these updates until the beginning of the next grace period in commit 0aa04b055e71 ("rcu: Process offlining and onlining only at grace-period start"), it is instead ->qsmaskinitnext that gets updated at that time. This commit therefore updates the obsolete comment. It also fixes punctuation while on the topic of comments mentioning ->qsmaskinit. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a5ba6db7b60..05f69b787a57 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2438,7 +2438,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * This function therefore goes up the tree of rcu_node structures, * clearing the corresponding bits in the ->qsmaskinit fields. Note that * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated + * updated. * * This function does check that the specified rcu_node structure has * all CPUs offline and no blocked tasks, so it is OK to invoke it @@ -3709,7 +3709,7 @@ void rcu_cpu_starting(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit + * function. We now remove it from the rcu_node tree's ->qsmaskinitnext * bit masks. */ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -- cgit v1.2.3 From 8d672fa6bf68ffc36a0c5e4868499f86bbea2308 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 14:46:43 -0700 Subject: rcu: Make rcu_init_new_rnp() stop upon already-set bit Currently, rcu_init_new_rnp() walks up the rcu_node combining tree, setting bits in the ->qsmaskinit fields on the way up. It walks up unconditionally, regardless of the initial state of these bits. This is OK because only the corresponding RCU grace-period kthread ever tests or sets these bits during runtime. However, it is also pointless, and it increases both memory and lock contention (albeit only slightly), so this commit stops the walk as soon as an already-set bit is encountered. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 05f69b787a57..3fe854a15d82 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3479,9 +3479,10 @@ EXPORT_SYMBOL_GPL(rcu_barrier_sched); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) { long mask; + long oldmask; struct rcu_node *rnp = rnp_leaf; - raw_lockdep_assert_held_rcu_node(rnp); + raw_lockdep_assert_held_rcu_node(rnp_leaf); WARN_ON_ONCE(rnp->wait_blkd_tasks); for (;;) { mask = rnp->grpmask; @@ -3489,8 +3490,11 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) if (rnp == NULL) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ + oldmask = rnp->qsmaskinit; rnp->qsmaskinit |= mask; raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ + if (oldmask) + return; } } -- cgit v1.2.3 From c74859d1eb2d8578bdf6d78ba893e394085aba1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Apr 2018 14:05:27 -0700 Subject: rcu: Make rcu_report_unblock_qs_rnp() warn on violated preconditions If rcu_report_unblock_qs_rnp() is invoked on something other than preemptible RCU or if there are still preempted tasks blocking the current grace period, something went badly wrong in the caller. This commit therefore adds WARN_ON_ONCE() to these conditions, but leaving the legitimate reason for early exit (rnp->qsmask != 0) unwarned. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3fe854a15d82..85417584ffd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2308,8 +2308,10 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || - rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || + WARN_ON_ONCE(rsp != rcu_state_p) || + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || + rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } -- cgit v1.2.3 From 77cfc7bf24ba0ba37be54e224007847d485a860f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 15:00:10 -0700 Subject: rcu: Fix typo and add additional debug This commit fixes a typo and adds some additional debugging to the message emitted when a task blocking the current grace period is listed as blocking it when either that grace period ends or the next grace period begins. This commit also reformats the console message for readability. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85417584ffd0..a4277c1087d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2316,6 +2316,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; /* Still need more quiescent states! 
*/ } + rnp->completedqs = rnp->gp_seq; rnp_p = rnp->parent; if (rnp_p == NULL) { /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3a6e04103de1..677b0c9f548d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -856,8 +856,14 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) struct list_head *lhp; raw_lockdep_assert_held_rcu_node(rnp); - pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks)); - pr_cont("\t->blkd_tasks"); + pr_info("%s: grp: %d-%d level: %d ->gp_seq %#lx ->completedqs %#lx\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + rnp->gp_seq, rnp->completedqs); + pr_info("%s: ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext); + pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", + __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); -- cgit v1.2.3 From 91f63ced7dc4e80acd13386204327d5de00a672d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 May 2018 15:05:45 -0700 Subject: rcu: Replace smp_wmb() with smp_store_release() for stall check This commit gets rid of the smp_wmb() in record_gp_stall_check_time() in favor of an smp_store_release(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4277c1087d9..439228b79811 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1246,9 +1246,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) unsigned long j1; rsp->gp_start = j; - smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - WRITE_ONCE(rsp->jiffies_stall, j + j1); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ rsp->jiffies_resched = j + j1 / 2; rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } -- cgit v1.2.3 From 928164351e700f91ab588f20fe470cac9db477a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 11:07:02 -0700 Subject: rcu: Prevent useless FQS scan after all CPUs have checked in The force_qs_rnp() function checks for ->qsmask being all zero, that is, all CPUs for the current rcu_node structure having already passed through quiescent states. But with RCU-preempt, this is not sufficient to report quiescent states further up the tree, so there are further checks that can initiate RCU priority boosting and also for races with CPU-hotplug operations. However, if neither of these further checks apply, the code proceeds to carry out a useless scan of an all-zero ->qsmask. This commit therefore adds code to release the current rcu_node structure's lock and continue on to the next rcu_node structure, thereby avoiding this useless scan. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 439228b79811..3efd591fcd15 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2672,6 +2672,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_report_unblock_qs_rnp() rlses ->lock */ continue; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; } for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); -- cgit v1.2.3 From 5554788e1d4253a92b794a9006b7ae2c10be52af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 10:14:34 -0700 Subject: rcu: Suppress false-positive offline-CPU lockdep-RCU splat The rcu_lockdep_current_cpu_online() function currently checks only the RCU-sched data structures to determine whether or not RCU believes that a given CPU is offline. Unfortunately, there are multiple flavors of RCU, which means that there is a short window of time during which the various flavors disagree as to whether or not a given CPU is offline. This can result in false-positive lockdep-RCU splats in which some other flavor of RCU tries to do something based on its view that the CPU is online, only to get hit with a lockdep-RCU splat because RCU-sched instead believes that the CPU is offline. This commit therefore changes rcu_lockdep_current_cpu_online() to scan all RCU flavors and to consider a given CPU to be online if any of the RCU flavors believe it to be online, thus preventing these false-positive splats. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3efd591fcd15..a7773fc72b0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1030,41 +1030,41 @@ void rcu_request_urgent_qs_task(struct task_struct *t) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* - * Is the current CPU online? Disable preemption to avoid false positives - * that could otherwise happen due to the current CPU number being sampled, - * this task being preempted, its old CPU being taken offline, resuming - * on some other CPU, then determining that its old CPU is now offline. - * It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. Note also that it is OK - * for a CPU coming online to use RCU for one jiffy prior to marking itself - * online in the cpu_online_mask. Similarly, it is OK for a CPU going - * offline to continue to use RCU for one jiffy after marking itself - * offline in the cpu_online_mask. This leniency is necessary given the - * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the teardown - * of the CPU. + * Is the current CPU online as far as RCU is concerned? * - * This is also why RCU internally marks CPUs online during in the - * preparation phase and offline after the CPU has been taken down. + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. 
Because there are + * multiple flavors of RCU, and because this function can be called in the + * midst of updating the flavors while a given CPU coming online or going + * offline, it is necessary to check all flavors. If any of the flavors + * believe that given CPU is online, it is considered to be online. * - * Disable checking if in an NMI handler because we cannot safely report - * errors from NMI handlers anyway. + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. */ bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; struct rcu_node *rnp; - bool ret; + struct rcu_state *rsp; - if (in_nmi()) + if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable(); - rdp = this_cpu_ptr(&rcu_sched_data); - rnp = rdp->mynode; - ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || - !rcu_scheduler_fully_active; + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { + preempt_enable(); + return true; + } + } preempt_enable(); - return ret; + return false; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -- cgit v1.2.3 From fece27760ff57a0ca16c37e144f6a58a2f1851e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 May 2018 20:04:12 -0700 Subject: rcu: Suppress false-positive preempted-task splats Consider the following sequence of events in a PREEMPT=y kernel: 1. All CPUs corresponding to a given rcu_node structure go offline. A new grace period starts just after the CPU-hotplug code path does its synchronize_rcu() for the last CPU, so at least this CPU is present in that structure's ->qsmask. 2. Before the grace period ends, a CPU comes back online, and not just any CPU, but the one corresponding to a non-zero bit in the leaf rcu_node structure's ->qsmask. 3. A task running on the newly onlined CPU is preempted while in an RCU read-side critical section. Because this CPU's ->qsmask bit is net, not only does this task queue itself on the leaf rcu_node structure's ->blkd_tasks list, it also sets that structure's ->gp_tasks pointer to reference it. 4. The grace period started in #1 above comes to an end. This results in rcu_gp_cleanup() being invoked, which, among other things, checks to make sure that there are no tasks blocking the just-ended grace period, that is, that all ->gp_tasks pointers are NULL. The ->gp_tasks pointer corresponding to the task preempted in #3 above is non-NULL, which results in a splat. This splat is a false positive. The task's RCU read-side critical section cannot have begun before the just-ended grace period because this would mean either: (1) The CPU came online before the grace period started, which cannot have happened because the grace period started before that CPU was all the way offline, or (2) The task started its RCU read-side critical section on some other CPU, but then it would have had to have been preempted before migrating to this CPU, which would mean that it would have instead queued itself on that other CPU's rcu_node structure. This commit eliminates this false positive by adding code to the end of rcu_cleanup_dying_idle_cpu() that reports a quiescent state to RCU, which has the side-effect of clearing that CPU's ->qsmask bit, preventing the above scenario. 
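Stripped of locking and of the surrounding rcu_state machinery, the ordering this change enforces can be modeled in a few lines. The structure and helpers below are invented stand-ins, and rcu_report_qs_rnp() is reduced to "clear the bit and notice when none remain"; the real function also propagates the report up the tree and wakes the grace-period kthread.

#include <stdio.h>

struct toy_leaf {
        unsigned long qsmask;           /* CPUs the current grace period still waits on */
        unsigned long qsmaskinitnext;   /* CPUs RCU currently considers online */
};

static void report_qs(struct toy_leaf *l, unsigned long mask)
{
        l->qsmask &= ~mask;
        if (!l->qsmask)
                printf("grace period can now end\n");
}

/* Mirrors the ordering in rcu_cleanup_dying_idle_cpu(): report the QS, then go offline. */
static void cpu_dying_idle(struct toy_leaf *l, unsigned long mask)
{
        if (l->qsmask & mask)           /* grace period still waiting on the outgoing CPU? */
                report_qs(l, mask);     /* report -before- touching ->qsmaskinitnext */
        l->qsmaskinitnext &= ~mask;     /* now remove the CPU from RCU's online view */
}

int main(void)
{
        struct toy_leaf leaf = { .qsmask = 0x1, .qsmaskinitnext = 0x3 };

        cpu_dying_idle(&leaf, 0x1);     /* CPU 0 goes offline in mid-grace-period */
        printf("qsmask=%#lx qsmaskinitnext=%#lx\n", leaf.qsmask, leaf.qsmaskinitnext);
        return 0;
}

Reporting first clears the leaf's ->qsmask bit for the outgoing CPU, which is what prevents the ->gp_tasks false positive described above.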
This approach has the added benefit of more promptly reporting quiescent states corresponding to offline CPUs. Note well that the call to rcu_report_qs_rnp() reporting the quiescent state must come -before- the clearing of this CPU's bit in the leaf rcu_node structure's ->qsmaskinitnext field. Otherwise, lockdep-RCU will complain bitterly about quiescent states coming from an offline CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a7773fc72b0c..72adf97458e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3731,6 +3731,11 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ + /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit v1.2.3 From 99990da1b3c00f6c05a330e06a76b9dbc8416d7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 09:45:00 -0700 Subject: rcu: Suppress more involved false-positive preempted-task splats Consider the following sequence of events in a PREEMPT=y kernel: 1. All but one of the CPUs corresponding to a given leaf rcu_node structure go offline. Each of these CPUs clears its bit in that structure's ->qsmaskinitnext field. 2. A new grace period starts, and rcu_gp_init() scans the leaf rcu_node structures, applying CPU-hotplug changes since the start of the previous grace period, including those changes in #1 above. This copies each leaf structure's ->qsmaskinitnext to its ->qsmask field, which represents the CPUs that this new grace period will wait on. Each copy operation is done holding the corresponding leaf rcu_node structure's ->lock, and at the end of this scan, rcu_gp_init() holds no locks. 3. The last CPU corresponding to #1's leaf rcu_node structure goes offline, clearing its bit in that structure's ->qsmaskinitnext field, but not touching the ->qsmaskinit field. Note that rcu_gp_init() is not currently holding any locks! This CPU does -not- report a quiescent state because the grace period has not yet initialized itself sufficiently to have set any bits in any of the leaf rcu_node structures' ->qsmask fields. 4. The rcu_gp_init() function continues initializing the new grace period, copying each leaf rcu_node structure's ->qsmaskinit field to its ->qsmask field while holding the corresponding ->lock. This sets the ->qsmask bit corresponding to #3's CPU. 5. Before the grace period ends, #3's CPU comes back online. Because te grace period has not yet done any force-quiescent-state scans (which would report a quiescent state on behalf of any offline CPUs), this CPU's ->qsmask bit is still set. 6. A task running on the newly onlined CPU is preempted while in an RCU read-side critical section. Because this CPU's ->qsmask bit is net, not only does this task queue itself on the leaf rcu_node structure's ->blkd_tasks list, it also sets that structure's ->gp_tasks pointer to reference it. 7. The grace period started in #1 above comes to an end. 
This results in rcu_gp_cleanup() being invoked, which, among other things, checks to make sure that there are no tasks blocking the just-ended grace period, that is, that all ->gp_tasks pointers are NULL. The ->gp_tasks pointer corresponding to the task preempted in #3 above is non-NULL, which results in a splat. This splat is a false positive. The task's RCU read-side critical section cannot have begun before the just-ended grace period because this would mean either: (1) The CPU came online before the grace period started, which cannot have happened because the grace period started before that CPU went offline, or (2) The task started its RCU read-side critical section on some other CPU, but then it would have had to have been preempted before migrating to this CPU, which would mean that it would have instead queued itself on that other CPU's rcu_node structure. RCU's grace periods thus are working correctly. Or, more accurately, that remaining bugs in RCU's grace periods are elsewhere. This commit eliminates this false positive by adding code to the end of rcu_cpu_starting() that reports a quiescent state to RCU, which has the side-effect of clearing that CPU's ->qsmask bit, preventing the above scenario. This approach has the added benefit of more promptly reporting quiescent states corresponding to offline CPUs. Nevertheless, this commit does -not- remove the need for the force-quiescent-state scans to check for offline CPUs, given that a CPU might remain offline indefinitely. And without the checks in the force-quiescent-state scans, the grace period would also persist indefinitely, which could result in hangs or memory exhaustion. Note well that the call to rcu_report_qs_rnp() reporting the quiescent state must come -after- the setting of this CPU's bit in the leaf rcu_node structure's ->qsmaskinitnext field. Otherwise, lockdep-RCU will complain bitterly about quiescent states coming from an offline CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 72adf97458e3..6275ed3925e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3710,7 +3710,12 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + /* Report QS -after- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + } else { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } } smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -- cgit v1.2.3 From 0b107d24d9361132758374a7b007c7c74efa007f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 16:18:28 -0700 Subject: rcu: Suppress false-positive splats from mid-init task resume Consider the following sequence of events in a PREEMPT=y kernel: 1. All CPUs corresponding to a given leaf rcu_node structure are offline. 2. The first phase of the rcu_gp_init() function's grace-period initialization runs, and sets that rcu_node structure's ->qsmaskinit to zero, as it should. 3. One of the CPUs corresponding to that rcu_node structure comes back online. Note that because this CPU came online after the grace period started, this grace period can safely ignore this newly onlined CPU. 4. 
A task running on the newly onlined CPU enters an RCU-preempt read-side critical section, and is then preempted. Because the corresponding rcu_node structure's ->qsmask is zero, rcu_preempt_ctxt_queue() leaves the rcu_node structure's ->gp_tasks field NULL, as it should. 5. The rcu_gp_init() function continues running the second phase of grace-period initialization. The ->qsmask field of the parent of the aforementioned leaf rcu_node structure is set to not expect a quiescent state from the leaf, as is only right and proper. However, when rcu_gp_init() reaches the leaf, it invokes rcu_preempt_check_blocked_tasks(), which sees that the leaf's ->blkd_tasks list is non-empty, and therefore sets the leaf's ->gp_tasks field to reference the first task on that list. 6. The grace period ends before the preempted task resumes, which is perfectly fine, given that this grace period was under no obligation to wait for that task to exit its late-starting RCU-preempt read-side critical section. Unfortunately, the leaf's ->gp_tasks field is non-NULL, so rcu_gp_cleanup() splats. After all, it appears to rcu_gp_cleanup() that the grace period failed to wait for a task that was supposed to be blocking that grace period. This commit avoids this false-positive splat by adding a check of both ->qsmaskinit and ->wait_blkd_tasks to rcu_preempt_check_blocked_tasks(). If both ->qsmaskinit and ->wait_blkd_tasks are zero, then the task must have entered its RCU-preempt read-side critical section late (after all, the CPU that it is running on was not online at that time), which means that the upper-level rcu_node structure won't be waiting for anything on the leaf anyway. If ->wait_blkd_tasks is non-zero, then there is at least one task on ths rcu_node structure's ->blkd_tasks list whose RCU read-side critical section predates the current grace period. If ->qsmaskinit is non-zero, there is at least one CPU that was online at the start of the current grace period. Thus, if both are zero, there is nothing to wait for. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 677b0c9f548d..1c9a836af5b6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -703,7 +703,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); - if (rcu_preempt_has_tasks(rnp)) { + if (rcu_preempt_has_tasks(rnp) && + (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); -- cgit v1.2.3 From ec2c29765a4ab12c236ac5a89b89660222ff6b01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2018 09:34:17 -0700 Subject: rcu: Fix grace-period hangs from mid-init task resume Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario: 1. A task running on a given CPU is preempted in its RCU read-side critical section. 2. That CPU goes offline, and there are now no online CPUs corresponding to that CPU's leaf rcu_node structure. 3. The rcu_gp_init() function does the first phase of grace-period initialization, and sets the aforementioned leaf rcu_node structure's ->qsmaskinit field to all zeroes. 
Because there is a blocked task, it does not propagate the zeroing of either ->qsmaskinit or ->qsmaskinitnext up the rcu_node tree. 4. The task resumes on some other CPU and exits its critical section. There is no grace period in progress, so the resulting quiescent state is not reported up the tree. 5. The rcu_gp_init() function does the second phase of grace-period initialization, which results in the leaf rcu_node structure being initialized to expect no further quiescent states, but with that structure's parent expecting a quiescent-state report. The parent will never receive a quiescent state from this leaf rcu_node structure, so the grace period will hang, resulting in RCU CPU stall warnings. It would be good to get rid of the special fail-safe quiescent-state propagation checks. This commit therefore checks the leaf rcu_node structure's ->wait_blkd_tasks field during grace-period initialization. If this flag is set, the rcu_report_qs_rnp() is invoked to immediately report the possible quiescent state. While in the neighborhood, this commit also report quiescent states for any CPUs that went offline between the two phases of grace-period initialization, thus reducing grace-period delays and hopefully eventually allowing removal of offline-CPU checks from the force-quiescent-state code path. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6275ed3925e9..7f872721c54e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -154,6 +154,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); @@ -1858,7 +1861,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) */ static bool rcu_gp_init(struct rcu_state *rsp) { + unsigned long flags; unsigned long oldmask; + unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1951,7 +1956,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq_rcu_node(rnp); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1962,7 +1967,12 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq_rcu_node(rnp); + /* Quiescent states for tasks on any now-offline CPUs. */ + mask = rnp->qsmask & ~rnp->qsmaskinitnext; + if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + else + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2233,6 +2243,10 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * is the grace-period snapshot, which means that the quiescent states * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. 
+ * + * As a special case, if mask is zero, the bit-already-cleared check is + * disabled. This allows propagating quiescent state due to resumed tasks + * during grace-period initialization. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, @@ -2246,7 +2260,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask) || rnp->gp_seq != gps) { + if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) { /* * Our bit has already been cleared, or the -- cgit v1.2.3 From 1e64b15a4b102e1cd059d4d798b7a78f93341333 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2018 19:23:09 -0700 Subject: rcu: Fix grace-period hangs due to race with CPU offline Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario: 1. CPU 1 goes offline. 2. Because CPU 1 is the only CPU in the system blocking the current grace period, the grace period ends as soon as rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp() returns. 3. At this point, the leaf rcu_node structure's ->lock is no longer held: rcu_report_qs_rnp() has released it, as it must in order to awaken the RCU grace-period kthread. 4. At this point, that same leaf rcu_node structure's ->qsmaskinitnext field still records CPU 1 as being online. This is absolutely necessary because the scheduler uses RCU (in this case on the wake-up path while awakening RCU's grace-period kthread), and ->qsmaskinitnext contains RCU's idea as to which CPUs are online. Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's bit from ->qsmaskinitnext would result in a lockdep-RCU splat due to RCU being used from an offline CPU. 5. RCU's grace-period kthread awakens, sees that the old grace period has completed and that a new one is needed. It therefore starts a new grace period, but because CPU 1's leaf rcu_node structure's ->qsmaskinitnext field still shows CPU 1 as being online, this new grace period is initialized to wait for a quiescent state from the now-offline CPU 1. 6. Without the fail-safe force-quiescent-state checks, there would be no quiescent state from the now-offline CPU 1, which would eventually result in RCU CPU stall warnings and memory exhaustion. It would be good to get rid of the special fail-safe quiescent-state propagation checks, and thus it would be good to fix things so that the above scenario cannot happen. This commit therefore adds a new ->ofl_lock to the rcu_state structure. This lock is held by rcu_gp_init() across the applying of buffered online and offline operations to the rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu() when buffering a new offline operation. This prevents rcu_gp_init() from acquiring the leaf rcu_node structure's lock during the interval between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(), which releases ->lock and the re-acquisition of that same lock. This in turn prevents the failure scenario outlined above, and will hopefully eventually allow removal of the offline-CPU checks from the force-quiescent-state code path. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7f872721c54e..50e4f7ebf8cf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,6 +101,7 @@ struct rcu_state sname##_state = { \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ + .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -1900,11 +1901,13 @@ static bool rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); + spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); continue; } @@ -1940,6 +1943,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) } raw_spin_unlock_irq_rcu_node(rnp); + spin_unlock(&rsp->ofl_lock); } /* @@ -3749,6 +3753,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ @@ -3757,6 +3762,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + spin_unlock(&rsp->ofl_lock); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3def94fc9c74..6683da6e4ecc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -363,6 +363,10 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + + spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + /* Synchronize offline with */ + /* GP pre-initialization. */ }; /* Values for rcu_state structure's gp_flags field. */ -- cgit v1.2.3 From 1f3e5f51b933cbc25e3da0cdbdac40716df04ddb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 10:35:33 -0700 Subject: rcu: Add RCU-preempt check for waiting on newly onlined CPU RCU should only be waiting on CPUs that were online at the time that the current grace period started. Failure to abide by this rule can result in confusing splats during grace-period cleanup and initialization. This commit therefore adds a check to RCU-preempt's preempted-task queuing that checks for waiting on newly onlined CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1c9a836af5b6..6b85ce936ad4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -183,6 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); + /* RCU better not be waiting on newly onlined CPUs! */ + WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask & + rdp->grpmask); /* * Decide where to queue the newly blocked task. 
In theory, -- cgit v1.2.3 From f34f2f5852e556ee1c3b3b294571086b1791008a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 13:40:25 -0700 Subject: rcu: Move grace-period pre-init delay after pre-init The main race with the early part of grace-period initialization appears to be with CPU hotplug. To more fully open this race window, this commit moves the rcu_gp_slow() from the beginning of the early initialization loop to follow that loop, thus widening the race window, especially for the rcu_node structures that are initialized last. This commit also expands rcutree.gp_preinit_delay from 3 to 12, giving the same overall delay in the grace period, but concentrated in the spot where it will do the most good. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50e4f7ebf8cf..c577cadcc4f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1900,7 +1900,6 @@ static bool rcu_gp_init(struct rcu_state *rsp) * will handle subsequent offline CPUs. */ rcu_for_each_leaf_node(rsp, rnp) { - rcu_gp_slow(rsp, gp_preinit_delay); spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1945,6 +1944,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq_rcu_node(rnp); spin_unlock(&rsp->ofl_lock); } + rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ /* * Set the quiescent-state-needed bits in all the rcu_node -- cgit v1.2.3 From 17a8212b8de21ce1b9a91fb75c8a6fb337685b9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 May 2018 14:30:02 -0700 Subject: rcu: Remove failsafe check for lost quiescent state Now that quiescent-state reporting is fully event-driven, this commit removes the check for a lost quiescent state from force_qs_rnp(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c577cadcc4f8..770f0df54015 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2317,8 +2317,9 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) +static void __maybe_unused +rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { unsigned long gps; @@ -2679,17 +2680,6 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) /* rcu_initiate_boost() releases rnp->lock */ continue; } - if (rnp->parent && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period - * initialization and task exiting RCU - * read-side critical section: Report. - */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); - /* rcu_report_unblock_qs_rnp() rlses ->lock */ - continue; - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } -- cgit v1.2.3 From e05121ba5b81e2f85349f038642410578457f6db Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 7 May 2018 12:07:48 -0700 Subject: rcu: Remove CPU-hotplug failsafe from force-quiescent-state code path Now that quiescent states for newly offlined CPUs are reported either when that CPU goes offline or at the end of grace-period initialization, the CPU-hotplug failsafe in the force-quiescent-state code path is no longer needed. This commit therefore removes this failsafe. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +-------- kernel/rcu/tree.h | 1 - 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 770f0df54015..5f1a11f1f7bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1188,14 +1188,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } - /* Check for the CPU being offline. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("ofl")); - rdp->offline_fqs++; - rcu_gpnum_ovf(rnp, rdp); - return 1; - } - /* * A CPU running for an extended time within the kernel can * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, @@ -3718,6 +3710,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ + rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6683da6e4ecc..795d469c6f67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -217,7 +217,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long cond_resched_completed; /* Grace period that needs help */ /* from cond_resched(). */ -- cgit v1.2.3 From ff3cee39088b1931a432587059d66cd505f785dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 12:50:14 -0700 Subject: rcu: Add up-tree information to dump_blkd_tasks() diagnostics This commit updates dump_blkd_tasks() to print out quiescent-state bitmasks for the rcu_node structures further up the tree. This information helps debugging of interactions between CPU-hotplug operations and RCU grace-period initialization. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6b85ce936ad4..f45ff97b0d51 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -858,13 +858,15 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { int i; struct list_head *lhp; + struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); - pr_info("%s: grp: %d-%d level: %d ->gp_seq %#lx ->completedqs %#lx\n", + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - rnp->gp_seq, rnp->completedqs); - pr_info("%s: ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", - __func__, rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext); + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); pr_info("%s: ->blkd_tasks", __func__); -- cgit v1.2.3 From 577389423187d8b51dfe6199297e579a3419b72b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 14:18:57 -0700 Subject: rcu: Add CPU online/offline state to dump_blkd_tasks() Interactions between CPU-hotplug operations and grace-period initialization can result in dump_blkd_tasks(). One of the first debugging actions in this case is to search back in dmesg to work out which of the affected rcu_node structure's CPUs are online and to determine the last CPU-hotplug operation affecting any of those CPUs. This can be laborious and error-prone, especially when console output is lost. This commit therefore causes dump_blkd_tasks() to dump the state of the affected rcu_node structure's CPUs and the last grace period during which the last offline and online operation affected each of these CPUs. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 12 ++++++++++-- kernel/rcu/tree.h | 12 +++++++++--- kernel/rcu/tree_plugin.h | 25 ++++++++++++++++++++----- 3 files changed, 39 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f1a11f1f7bc..a2503ef1bbe2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1954,7 +1954,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rnp); + rcu_preempt_check_blocked_tasks(rsp, rnp); rnp->qsmask = rnp->qsmaskinit; WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); if (rnp == rdp->mynode) @@ -2063,7 +2063,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rnp, 10); + dump_blkd_tasks(rsp, rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); rdp = this_cpu_ptr(rsp->rda); @@ -3516,6 +3516,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); + rdp->rcu_ofl_gp_seq = rsp->gp_seq; + rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_seq = rsp->gp_seq; + rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -3711,6 +3715,8 @@ void rcu_cpu_starting(unsigned int cpu) /* Allow lockless access for expedited grace periods. */ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ + rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); @@ -3738,6 +3744,8 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; spin_lock(&rsp->ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); + rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 795d469c6f67..f52bc059bfec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -255,12 +255,16 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 7) RCU CPU stall data. */ + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? */ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ + unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ + short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ + short rcu_onl_gp_flags; /* ->gp_flags at last online. 
*/ int cpu; struct rcu_state *rsp; @@ -431,11 +435,13 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, + struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, + int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f45ff97b0d51..613372246a07 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -699,13 +699,14 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rnp, 10); + dump_blkd_tasks(rsp, rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; @@ -854,10 +855,14 @@ void exit_rcu(void) * Dump the blocked-tasks state, but limit the list dump to the * specified number of elements. */ -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) { + int cpu; int i; struct list_head *lhp; + bool onl; + struct rcu_data *rdp; struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); @@ -877,6 +882,14 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) break; } pr_cont("\n"); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + rdp = per_cpu_ptr(rsp->rda, cpu); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", + cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + } } #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -949,7 +962,8 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +static void +rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) { WARN_ON_ONCE(rnp->qsmask); } @@ -990,7 +1004,8 @@ void exit_rcu(void) /* * Dump the guaranteed-empty blocked-tasks state. Trust but verify. */ -static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck) +static void +dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) { WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } -- cgit v1.2.3 From fea3f222d3523dfdd0e86b11227d3cda20765102 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 15 May 2018 15:47:30 -0700 Subject: rcu: Record ->gp_state for both phases of grace-period initialization Grace-period initialization first processes any recent CPU-hotplug operations, and then initializes state for the new grace period. These two phases of initialization are currently not distinguished in debug prints, but the distinction is valuable in a number of debug situations. This commit therefore introduces two new values for ->gp_state, RCU_GP_ONOFF and RCU_GP_INIT, in order to make this distinction. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2503ef1bbe2..ee218d743226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1891,6 +1891,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * for subsequent online CPUs, and that quiescent-state forcing * will handle subsequent offline CPUs. */ + rsp->gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rsp, rnp) { spin_lock(&rsp->ofl_lock); raw_spin_lock_irq_rcu_node(rnp); @@ -1950,6 +1951,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ + rsp->gp_state = RCU_GP_INIT; rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f52bc059bfec..8077aff7ab40 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -380,16 +380,20 @@ struct rcu_state { #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ -#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ -#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ -#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ -#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#define RCU_GP_ONOFF 3 /* Grace-period initialization hotplug. */ +#define RCU_GP_INIT 4 /* Grace-period initialization. */ +#define RCU_GP_WAIT_FQS 5 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 6 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ #ifndef RCU_TREE_NONCORE static const char * const gp_state_names[] = { "RCU_GP_IDLE", "RCU_GP_WAIT_GPS", "RCU_GP_DONE_GPS", + "RCU_GP_ONOFF", + "RCU_GP_INIT", "RCU_GP_WAIT_FQS", "RCU_GP_DOING_FQS", "RCU_GP_CLEANUP", -- cgit v1.2.3 From f2e2df59786d7bd52e6e7e2d10c1c6ba433a0ee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 May 2018 16:23:23 -0700 Subject: rcu: Add diagnostics for offline CPUs failing to report QS CPUs are expected to report quiescent states when coming online and when going offline, and grace-period initialization is supposed to handle any race conditions where a CPU's ->qsmask bit is set just after it goes offline. This commit adds diagnostics for the case where an offline CPU nevertheless has a grace period waiting on it. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 22 ++++++++++++++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ee218d743226..d3333ee2c6f5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1188,6 +1188,27 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) smp_store_release(ruqp, true); } + /* If waiting too long on an offline CPU, complain. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && + time_after(jiffies, rdp->rsp->gp_start + HZ)) { + bool onl; + struct rcu_node *rnp1; + + WARN_ON(1); /* Offline CPUs are supposed to report QS! */ + pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", + __func__, rnp->grplo, rnp->grphi, rnp->level, + (long)rnp->gp_seq, (long)rnp->completedqs); + for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) + pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", + __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); + onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); + pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", + __func__, rdp->cpu, ".o"[onl], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + return 1; /* Break things loose after complaining. */ + } + /* * A CPU running for an extended time within the kernel can * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, @@ -1967,6 +1988,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); /* Quiescent states for tasks on any now-offline CPUs. */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; + rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); else diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8077aff7ab40..d51e6edc8e83 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -90,6 +90,7 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ + unsigned long rcu_gp_init_mask; /* Mask of offline CPUs at GP init. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ -- cgit v1.2.3 From 3949fa9bac090ad217534c30bc3b6572289abf21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 15:29:10 -0700 Subject: rcu: Make rcu_read_unlock_special() static Because rcu_read_unlock_special() is no longer used outside of kernel/rcu/tree_plugin.h, this commit makes it static. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 613372246a07..54a251640f53 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -127,6 +127,7 @@ static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); +static void rcu_read_unlock_special(struct task_struct *t); /* * Tell them what RCU they are running. @@ -461,7 +462,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * notify RCU core processing or task having blocked during the RCU * read-side critical section. 
*/ -void rcu_read_unlock_special(struct task_struct *t) +static void rcu_read_unlock_special(struct task_struct *t) { bool empty_exp; bool empty_norm; -- cgit v1.2.3 From 07f27570dcd148a5f4de7dc3513c1d1cd069b362 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 11 May 2018 17:30:34 +0900 Subject: rcu: Improve rcu_note_voluntary_context_switch() reporting We expect a quiescent state of TASKS_RCU when cond_resched_tasks_rcu_qs() is called, no matter whether it actually be scheduled or not. However, it currently doesn't report the quiescent state when the task enters into __schedule() as it's called with preempt = true. So make it report the quiescent state unconditionally when cond_resched_tasks_rcu_qs() is called. And in TINY_RCU, even though the quiescent state of rcu_bh also should be reported when the tick interrupt comes from user, it doesn't. So make it reported. Lastly in TREE_RCU, rcu_note_voluntary_context_switch() should be reported when the tick interrupt comes from not only user but also idle, as an extended quiescent state. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney [ paulmck: Simplify rcutiny portion given no RCU-tasks for !PREEMPT. ] --- kernel/rcu/tiny.c | 4 +--- kernel/rcu/tree.c | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index a64eee0db39e..befc9321a89c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -122,10 +122,8 @@ void rcu_check_callbacks(int user) { if (user) rcu_sched_qs(); - else if (!in_softirq()) + if (user || !in_softirq()) rcu_bh_qs(); - if (user) - rcu_note_voluntary_context_switch(current); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3333ee2c6f5..19beabe73629 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2645,6 +2645,7 @@ void rcu_check_callbacks(int user) rcu_sched_qs(); rcu_bh_qs(); + rcu_note_voluntary_context_switch(current); } else if (!in_softirq()) { @@ -2660,8 +2661,7 @@ void rcu_check_callbacks(int user) rcu_preempt_check_callbacks(); if (rcu_pending()) invoke_rcu_core(); - if (user) - rcu_note_voluntary_context_switch(current); + trace_rcu_utilization(TPS("End scheduler-tick")); } -- cgit v1.2.3 From a7538352da722fae5cc95ae6656ea2013f5b8b21 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 14 May 2018 13:27:33 -0700 Subject: rcu: Use pr_fmt to prefix "rcu: " to logging output This commit also adjusts some whitespace while in the area. Signed-off-by: Joe Perches Signed-off-by: Paul E. McKenney [ paulmck: Revert string-breaking %s as requested by Andy Shevchenko. 
] --- kernel/rcu/rcuperf.c | 7 +++---- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/srcutree.c | 5 ++++- kernel/rcu/tree.c | 8 +++++--- kernel/rcu/tree_plugin.h | 10 ++++++---- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b080bc4a4f45..00e395c0d7d0 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -680,12 +680,11 @@ rcu_perf_init(void) break; } if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", - perf_type); + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); pr_alert("rcu-perf types:"); for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_alert(" %s", perf_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0481c7286875..90a94fecdd73 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1755,8 +1755,8 @@ rcu_torture_init(void) torture_type); pr_alert("rcu-torture types:"); for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); + pr_cont(" %s", torture_ops[i]->name); + pr_cont("\n"); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d6d6ea9738c0..e526b56998af 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "rcu: " fmt + #include #include #include @@ -390,7 +392,8 @@ void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p state: %d\n", + __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } free_percpu(sp->sda); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 19beabe73629..6f2922168216 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -27,6 +27,9 @@ * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ + +#define pr_fmt(fmt) "rcu: " fmt + #include #include #include @@ -1374,8 +1377,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. 
*/ - pr_err("INFO: %s detected stalls on CPUs/tasks:", - rsp->name); + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4048,7 +4050,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", + pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 54a251640f53..dbfe90191e19 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -74,8 +74,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - RCU_FANOUT); + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n", + RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) @@ -88,11 +88,13 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); if (rcu_fanout_leaf != RCU_FANOUT_LEAF) - pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", + rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST - pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", + kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif if (blimit != DEFAULT_RCU_BLIMIT) pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); -- cgit v1.2.3 From 6f56f714db067056c80f5d71510118f82872e34c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 May 2018 13:52:27 -0700 Subject: rcu: Improve RCU-tasks naming and comments The naming and comments associated with some RCU-tasks code make the faulty assumption that context switches due to cond_resched() are voluntary. As several people pointed out, this is not the case. This commit therefore updates function names and comments to better reflect current reality. Reported-by: Byungchul Park Reported-by: Joel Fernandes Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/update.c | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6f2922168216..ccc061acf887 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -457,7 +457,7 @@ void rcu_note_context_switch(bool preempt) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_dynticks.rcu_qs_ctr); if (!preempt) - rcu_note_voluntary_context_switch_lite(current); + rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. 
*/ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c230a60ece4..5783bdf86e5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -507,14 +507,15 @@ early_initcall(check_cpu_stall_init); #ifdef CONFIG_TASKS_RCU /* - * Simple variant of RCU whose quiescent states are voluntary context switch, - * user-space execution, and idle. As such, grace periods can take one good - * long time. There are no read-side primitives similar to rcu_read_lock() - * and rcu_read_unlock() because this implementation is intended to get - * the system into a safe state for some of the manipulations involved in - * tracing and the like. Finally, this implementation does not support - * high call_rcu_tasks() rates from multiple CPUs. If this is required, - * per-CPU callback lists will be needed. + * Simple variant of RCU whose quiescent states are voluntary context + * switch, cond_resched_rcu_qs(), user-space execution, and idle. + * As such, grace periods can take one good long time. There are no + * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() + * because this implementation is intended to get the system into a safe + * state for some of the manipulations involved in tracing and the like. + * Finally, this implementation does not support high call_rcu_tasks() + * rates from multiple CPUs. If this is required, per-CPU callback lists + * will be needed. */ /* Global list of callbacks and associated lock. */ @@ -542,11 +543,11 @@ static struct task_struct *rcu_tasks_kthread_ptr; * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_tasks() assumes * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), entry into idle, or transition to usermode - * execution. As such, there are no read-side primitives analogous to - * rcu_read_lock() and rcu_read_unlock() because this primitive is intended - * to determine that all tasks have passed through a safe state, not so - * much for data-strcuture synchronization. + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. -- cgit v1.2.3 From 15651201fa055ec81d3669b36ab7c2fb12c3ce36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 14:41:41 -0700 Subject: rcu: Mark task as .need_qs less aggressively If any scheduling-clock interrupt interrupts an RCU-preempt read-side critical section, the interrupted task's ->rcu_read_unlock_special.b.need_qs field is set. This causes the outermost rcu_read_unlock() to incur the extra overhead of calling into rcu_read_unlock_special(). This commit reduces that overhead by setting ->rcu_read_unlock_special.b.need_qs only if the grace period has been in effect for more than one second. Why one second? Because this is comfortably smaller than the minimum RCU CPU stall-warning timeout of three seconds, but long enough that the .need_qs marking should happen quite rarely. And if your RCU read-side critical section has run on-CPU for a full second, it is not unreasonable to invest some CPU time in ending the grace period quickly. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dbfe90191e19..0239cf8a4be6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -730,6 +730,7 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) */ static void rcu_preempt_check_callbacks(void) { + struct rcu_state *rsp = &rcu_preempt_state; struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { @@ -738,7 +739,9 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) && + !t->rcu_read_unlock_special.b.need_qs && + time_after(jiffies, rsp->gp_start + HZ)) t->rcu_read_unlock_special.b.need_qs = true; } -- cgit v1.2.3 From 3b57a3994f330a102b91bd70c84c9530c0ae50f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 16:01:56 -0700 Subject: rcu: Inline rcu_dynticks_momentary_idle() into its sole caller The rcu_dynticks_momentary_idle() function is invoked only from rcu_momentary_dyntick_idle(), and neither function is particularly large. This commit therefore saves a few lines by inlining rcu_dynticks_momentary_idle() into rcu_momentary_dyntick_idle(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ccc061acf887..ebdbb5f96e5c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -385,20 +385,6 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) return snap != rcu_dynticks_snap(rdtp); } -/* - * Do a double-increment of the ->dynticks counter to emulate a - * momentary idle-CPU quiescent state. - */ -static void rcu_dynticks_momentary_idle(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &rdtp->dynticks); - - /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); -} - /* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the @@ -430,12 +416,17 @@ bool rcu_eqs_special_set(int cpu) * * We inform the RCU core by emulating a zero-duration dyntick-idle period. * - * The caller must have disabled interrupts. + * The caller must have disabled interrupts and must not be idle. */ static void rcu_momentary_dyntick_idle(void) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); - rcu_dynticks_momentary_idle(); + special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } /* -- cgit v1.2.3 From c7037ff5249cee237b8c8a6f99998ae4f3916b60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 18:00:17 -0700 Subject: rcu: Clarify and correct the rcu_preempt_qs() header comment The rcu_preempt_qs() function only applies to the CPU, not the task. A task really is allowed to invoke this function while in an RCU-preempt read-side critical section, but only if it has first added itself to some leaf rcu_node structure's ->blkd_tasks list. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0239cf8a4be6..07d1ad175994 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -294,13 +294,17 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) } /* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * not in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. + * Record a preemptible-RCU quiescent state for the specified CPU. + * Note that this does not necessarily mean that the task currently running + * on the CPU is in a quiescent state: Instead, it means that the current + * grace period need not wait on any RCU read-side critical section that + * starts later on this CPU. It also means that if the current task is + * in an RCU read-side critical section, it has already added itself to + * some leaf rcu_node structure's ->blkd_tasks list. In addition to the + * current task, there might be any number of other tasks blocked while + * in an RCU read-side critical section. * - * As with the other rcu_*_qs() functions, callers to this function - * must disable preemption. + * Callers to this function must disable preemption. */ static void rcu_preempt_qs(void) { -- cgit v1.2.3 From 164ba3fc4864346dbc365f8b89d8888e1b6cd38c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 20:41:36 -0700 Subject: rcu: Remove unused rcu_kick_nohz_cpu() function The rcu_kick_nohz_cpu() function is no longer used, and the functionality it used to provide is now provided by a call to resched_cpu() in the force-quiescent-state function rcu_implicit_dynticks_qs(). This commit therefore removes rcu_kick_nohz_cpu(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d51e6edc8e83..4e74df768c57 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -483,7 +483,6 @@ static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 07d1ad175994..75a91d58b8f7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2645,23 +2645,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -/* - * An adaptive-ticks CPU can potentially execute in kernel mode for an - * arbitrarily long period of time with the scheduling-clock tick turned - * off. RCU will be paying attention to this CPU because it is in the - * kernel, but the CPU cannot be guaranteed to be executing the RCU state - * machine because the scheduling-clock tick has been disabled. Therefore, - * if an adaptive-ticks CPU is failing to respond to the current grace - * period and has not be idle from an RCU perspective, kick it. 
- */ -static void __maybe_unused rcu_kick_nohz_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(cpu)) - smp_send_reschedule(cpu); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ -} - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit v1.2.3 From ab6b82147f471e31d9397c95d523631b6c8953f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:21:20 -0700 Subject: rcu: Remove unused local variable "cpu" One danger of using __maybe_unused is that the compiler doesn't yell at you when you remove the last reference, witness rcu_bind_gp_kthread() and its local variable "cpu". This commit removes this local variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75a91d58b8f7..2cc9bf0d363a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2670,8 +2670,6 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) */ static void rcu_bind_gp_kthread(void) { - int __maybe_unused cpu; - if (!tick_nohz_full_enabled()) return; housekeeping_affine(current, HK_FLAG_RCU); -- cgit v1.2.3 From 95394e69c42f0da83a176936fdb28f6cac57ea69 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from panic_on_rcu_stall() and rcu_blocking_is_gp() These functions are in kernel/rcu/tree.c, which is not an include file, so there is no problem dropping the "inline", especially given that these functions are nowhere near a fastpath. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ebdbb5f96e5c..3504ee35e226 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1342,7 +1342,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } -static inline void panic_on_rcu_stall(void) +static void panic_on_rcu_stall(void) { if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); @@ -3080,7 +3080,7 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. */ -static inline int rcu_blocking_is_gp(void) +static int rcu_blocking_is_gp(void) { int ret; -- cgit v1.2.3 From eac45e586cd38a1b56aa716560002e68741b78a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from rcu_torture_print_module_parms() This function is in rcutorture.c, which is not an include file, so there is no problem dropping the "inline", especially given that this function is invoked only twice per rcutorture run. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 90a94fecdd73..57a4277ccc63 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1359,7 +1359,7 @@ rcu_torture_stats(void *arg) return 0; } -static inline void +static void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG -- cgit v1.2.3 From 9622179519c52ead944c3b6a07aed9c6db3659e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:33:17 -0700 Subject: rcu: Remove "inline" from rcu_perf_print_module_parms() This function is in rcuperf.c, which is not an include file, so there is no problem dropping the "inline", especially given that this function is invoked only twice per rcuperf run. This commit therefore delegates the inlining decision to the compiler by dropping the "inline". Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 00e395c0d7d0..3e86940245d9 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -536,7 +536,7 @@ retry: return 0; } -static inline void +static void rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG -- cgit v1.2.3 From 51fbb910f52c8559a78665d203e55ab2b95e7126 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 13:15:40 -0700 Subject: rcu: Remove __maybe_unused from rcu_cpu_has_callbacks() The rcu_cpu_has_callbacks() function is now used in all configurations, so this commit removes the __maybe_unused. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3504ee35e226..cdc4fca0c4cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3315,7 +3315,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; -- cgit v1.2.3 From b06ae25a1e2b54b2b5bc589a4a118b7bb39159fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 13:32:51 -0700 Subject: rcu: Use RCU CPU stall timeout for rcu_check_gp_start_stall() Currently, rcu_check_gp_start_stall() waits for one second after the first request before complaining that a grace period has not yet started. This was desirable while testing the conversion from ->future_gp_needed[] to ->gp_seq_needed, but it is a bit on the hair-trigger side for production use under heavy load. This commit therefore makes this wait time be exactly that of the RCU CPU stall warning, allowing easy adjustment of both timeouts to suit the distribution or installation at hand. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cdc4fca0c4cb..7746fe1ee3fc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2753,6 +2753,7 @@ static void rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; unsigned long flags; unsigned long j; struct rcu_node *rnp_root = rcu_get_root(rsp); @@ -2762,8 +2763,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || - time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || atomic_read(&warned)) return; @@ -2771,8 +2772,8 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, j = jiffies; if (rcu_gp_in_progress(rsp) || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || - time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || + time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || atomic_read(&warned)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -2784,18 +2785,18 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, j = jiffies; if (rcu_gp_in_progress(rsp) || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rsp->gp_req_activity + HZ) || - time_before(j, rsp->gp_activity + HZ) || + time_before(j, rsp->gp_req_activity + gpssdelay) || + time_before(j, rsp->gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", + pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", __func__, (long)READ_ONCE(rsp->gp_seq), (long)READ_ONCE(rnp_root->gp_seq_needed), j - rsp->gp_req_activity, j - rsp->gp_activity, - rsp->gp_flags, rsp->name, + rsp->gp_flags, rsp->gp_state, rsp->name, rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) -- cgit v1.2.3 From 0d805a70a652a6eef8d0283e5183879e7acb85ad Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 22 May 2018 23:38:13 -0700 Subject: rcu: Add comment documenting how rcu_seq_snap works rcu_seq_snap may be tricky to decipher. Lets document how it works with an example to make it easier. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney [ paulmck: Shrink comment as suggested by Peter Zijlstra. ] --- kernel/rcu/rcu.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index aa215d6355f8..89f13fffac73 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -91,7 +91,17 @@ static inline void rcu_seq_end(unsigned long *sp) WRITE_ONCE(*sp, rcu_seq_endval(sp)); } -/* Take a snapshot of the update side's sequence number. */ +/* + * rcu_seq_snap - Take a snapshot of the update side's sequence number. 
+ * + * This function returns the earliest value of the grace-period sequence number + * that will indicate that a full grace period has elapsed since the current + * time. Once the grace-period sequence number has reached this value, it will + * be safe to invoke all callbacks that have been registered prior to the + * current time. This value is the current grace-period number plus two to the + * power of the number of low-order bits reserved for state, then rounded up to + * the next value in which the state bits are all zero. + */ static inline unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; -- cgit v1.2.3 From c03be752d39dc64dcfda0ac8ce87fb10b1ee5621 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:49:46 -0400 Subject: rcu: Speed up calling of RCU tasks callbacks Joel Fernandes found that the synchronize_rcu_tasks() was taking a significant amount of time. He demonstrated it with the following test: # cd /sys/kernel/tracing # while [ 1 ]; do x=1; done & # echo '__schedule_bug:traceon' > set_ftrace_filter # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m1.064s user 0m0.000s sys 0m0.004s Where it takes a little over a second to perform the synchronize, because there's a loop that waits 1 second at a time for tasks to get through their quiescent points when there's a task that must be waited for. After discussion we came up with a simple way to wait for holdouts but increase the time for each iteration of the loop but no more than a full second. With the new patch we have: # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m0.131s user 0m0.000s sys 0m0.004s Which drops it down to 13% of what the original wait time was. Link: http://lkml.kernel.org/r/20180523063815.198302-2-joel@joelfernandes.org Reported-by: Joel Fernandes (Google) Suggested-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5783bdf86e5a..4c7c49c106ee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -668,6 +668,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_FLAG_RCU); @@ -749,13 +750,25 @@ static int __noreturn rcu_tasks_kthread(void *arg) * holdouts. When the list is empty, we are done. */ lastreport = jiffies; - while (!list_empty(&rcu_tasks_holdouts)) { + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { bool firstreport; bool needreport; int rtst; struct task_struct *t1; - schedule_timeout_interruptible(HZ); + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); -- cgit v1.2.3 From cd23ac8ddb7be993f88bee893b89a8b4971c3651 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:58:16 -0400 Subject: rcu: Add comment to the last sleep in the rcu tasks loop At the end of rcu_tasks_kthread() there's a lonely schedule_timeout_uninterruptible() call with no apparent rationale for its existence. But there is. 
It is to keep the thread from going into a tight loop if there's some anomaly. That really needs a comment. Link: http://lkml.kernel.org/r/20180524223839.GU3803@linux.vnet.ibm.com Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c7c49c106ee..39cb23d22109 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -814,6 +814,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) list = next; cond_resched(); } + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_uninterruptible(HZ/10); } } -- cgit v1.2.3 From 47199a0812535217c29933cecf468568bb37f933 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2018 10:33:08 -0700 Subject: rcu: Add diagnostics for rcutorture writer stall warning This commit adds any in-the-future ->gp_seq_needed fields to the diagnostics for an rcutorture writer stall warning message. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7746fe1ee3fc..4915525559ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -606,11 +606,32 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); */ void show_rcu_gp_kthreads(void) { + int cpu; + struct rcu_data *rdp; + struct rcu_node *rnp; struct rcu_state *rsp; for_each_rcu_flavor(rsp) { pr_info("%s: wait state: %d ->state: %#lx\n", rsp->name, rsp->gp_state, rsp->gp_kthread->state); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_GE(rsp->gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", + rnp->grplo, rnp->grphi, rnp->gp_seq, + rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rsp->gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %lu\n", + cpu, rdp->gp_seq_needed); + } + } /* sched_show_task(rsp->gp_kthread); */ } } -- cgit v1.2.3 From 67abb96cbf307e16e3c6d1a0328ece085b5ce94c Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 1 Jun 2018 11:03:09 +0900 Subject: rcu: Check the range of jiffies_till_{first,next}_fqs when setting them Currently, the range of jiffies_till_{first,next}_fqs are checked and adjusted on and on in the loop of rcu_gp_kthread on runtime. However, it's enough to check them only when setting them, not every time in the loop. So make them handled on a setting time via sysfs. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4915525559ac..7498a416f63b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -510,8 +510,38 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; -module_param(jiffies_till_first_fqs, ulong, 0644); -module_param(jiffies_till_next_fqs, ulong, 0644); +static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? 
HZ : j); + return ret; +} + +static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp) +{ + ulong j; + int ret = kstrtoul(val, 0, &j); + + if (!ret) + WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + return ret; +} + +static struct kernel_param_ops first_fqs_jiffies_ops = { + .set = param_set_first_fqs_jiffies, + .get = param_get_ulong, +}; + +static struct kernel_param_ops next_fqs_jiffies_ops = { + .set = param_set_next_fqs_jiffies, + .get = param_get_ulong, +}; + +module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644); +module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); /* @@ -2180,10 +2210,6 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle quiescent-state forcing. */ first_gp_fqs = true; j = jiffies_till_first_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_first_fqs = HZ; - } ret = 0; for (;;) { if (!ret) { @@ -2218,13 +2244,6 @@ static int __noreturn rcu_gp_kthread(void *arg) WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; - } } else { /* Deal with stray signal. */ cond_resched_tasks_rcu_qs(); -- cgit v1.2.3 From 2ee5aca54622aacc196106c623fea4116f1043a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 9 Jun 2018 01:22:20 -0700 Subject: rcu: Make rcu_seq_diff() more exact The current implementation of rcu_seq_diff() follows tradition in providing a rough-and-ready approximation of the number of elapsed grace periods between the two rcu_seq values. However, this difference is used to flag RCU-failure "near misses", which can be a valuable debugging aid, so more exactitude would be an improvement. This commit therefore improves the accuracy of rcu_seq_diff(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 89f13fffac73..d5e0294b8580 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -158,7 +158,20 @@ static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new) */ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) { - return (new - old) >> RCU_SEQ_CTR_SHIFT; + unsigned long rnd_diff; + + if (old == new) + return 0; + /* + * Compute the number of grace periods (still shifted up), plus + * one if either of new and old is not an exact grace period. + */ + rnd_diff = (new & ~RCU_SEQ_STATE_MASK) - + ((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK) + + ((new & RCU_SEQ_STATE_MASK) || (old & RCU_SEQ_STATE_MASK)); + if (ULONG_CMP_GE(RCU_SEQ_STATE_MASK, rnd_diff)) + return 1; /* Definitely no grace period has elapsed. */ + return ((rnd_diff - RCU_SEQ_STATE_MASK - 1) >> RCU_SEQ_CTR_SHIFT) + 2; } /* -- cgit v1.2.3 From 89b4cd4b9ebf15b01f70b85105ea5f5c6b2a3788 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jun 2018 12:29:16 -0700 Subject: rcu: Print stall-warning NMI dyntick state in hexadecimal The ->dynticks_nmi_nesting field records the nesting depth of both interrupt and NMI handlers. Because the kernel can enter interrupts and never leave them (and vice versa) and because NMIs can interrupt manipulation of the ->dynticks_nmi_nesting field, the values in this field must be both chosen and manipulated very carefully.
As a result, although the value is zero when the corresponding CPU is executing neither an interrupt nor an NMI handler, it is 4,611,686,018,427,387,906 on 64-bit systems when there is a single level of interrupt/NMI handling in progress. This number is difficult to remember and interpret, so this commit switches the output to hexadecimal, resulting in the much nicer 0x4000000000000002. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2cc9bf0d363a..c1b17f5b9361 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1801,7 +1801,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], -- cgit v1.2.3 From 52e17ba1d063ab6adb367f288babd380e30bad46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2018 08:54:37 -0700 Subject: srcu: Add grace-period number to rcutorture statistics printout This commit adds the SRCU grace-period number to the rcutorture statistics printout, which allows it to be compared to the rcutorture "Writer stall state" message. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e526b56998af..6c9866a854b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1268,7 +1268,8 @@ void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) unsigned long s0 = 0, s1 = 0; idx = sp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", + tt, tf, rcu_seq_current(&sp->srcu_gp_seq), idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; -- cgit v1.2.3 From c7cd161ecb2188c07ba9560ca82aee756575359f Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:17 -0700 Subject: rcu: Assign higher prio to RCU threads if rcutorture is built-in The rcutorture RCU priority boosting tests fail even with CONFIG_RCU_BOOST set because rcutorture's threads run at the same priority as the default RCU kthreads (RT class with priority of 1). This patch checks if RCU torture is built into the kernel and if so, assigns RT priority 1 to the RCU threads, allowing the rcutorture boost tests to pass. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7498a416f63b..8143b8d40a6c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3923,12 +3923,16 @@ static int __init rcu_spawn_gp_kthread(void) struct task_struct *t; /* Force priority into range. 
*/ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) kthread_prio = 1; else if (kthread_prio < 0) kthread_prio = 0; else if (kthread_prio > 99) kthread_prio = 99; + if (kthread_prio != kthread_prio_in) pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", kthread_prio, kthread_prio_in); -- cgit v1.2.3 From 028be12b294e3a059e6fc06852d458fdc82717ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2018 09:20:34 -0700 Subject: rcutorture: Change units of onoff_interval to jiffies Some RCU bugs have been sensitive to the frequency of CPU-hotplug operations, which have been gradually increased over time. But this frequency is now at the one-second lower limit that can be specified using the rcutorture.onoff_interval kernel parameter. This commit therefore changes the units of rcutorture.onoff_interval from seconds to jiffies, and also sets the value specified for this kernel parameter in the TREE03 rcutorture scenario to 200, which is 200 milliseconds for HZ=1000. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0481c7286875..eb6d4915b4e6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -87,7 +87,7 @@ torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); + "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -1889,7 +1889,7 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); -- cgit v1.2.3 From 6bea2cc5a97b7e9677088b1a93e27edb74ae0e55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 May 2018 15:30:36 -0700 Subject: rcu: Remove rcutorture test version and sequence number Back when RCU had a debugfs interface, there was a test version and sequence number that allowed associating debugfs data with a particular test run, where the test run started with modprobe and ended with rmmod, which was how tests were run back on the old ABAT system within IBM. But rcutorture testing no longer runs on ABAT, and there is no longer an RCU debugfs interface, so there is no longer any need for test versions and sequence numbers. This commit therefore removes the rcutorture_record_test_transition() and rcutorture_record_progress() functions, and along with them the rcutorture_testseq and rcutorture_vernum variables that they update. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu.h | 4 ---- kernel/rcu/rcutorture.c | 4 +--- kernel/rcu/tree.c | 37 ------------------------------------- 3 files changed, 1 insertion(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index aa215d6355f8..0453a7d12b3f 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -444,7 +444,6 @@ enum rcutorture_type { #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq); -void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -458,7 +457,6 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *gp_seq = 0; } -static inline void rcutorture_record_test_transition(void) { } static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, @@ -505,8 +503,6 @@ static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ -extern unsigned long rcutorture_testseq; -extern unsigned long rcutorture_vernum; unsigned long rcu_get_gp_seq(void); unsigned long rcu_bh_get_gp_seq(void); unsigned long rcu_sched_get_gp_seq(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eb6d4915b4e6..335387fabac2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1016,7 +1016,7 @@ rcu_torture_writer(void *arg) break; } } - rcutorture_record_progress(++rcu_torture_current_version); + rcu_torture_current_version++; /* Cycle through nesting levels of rcu_expedite_gp() calls. */ if (can_expedite && !(torture_random(&rand) & 0xff & (!!expediting - 1))) { @@ -1613,7 +1613,6 @@ rcu_torture_cleanup(void) unsigned long gp_seq = 0; int i; - rcutorture_record_test_transition(); if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); @@ -1918,7 +1917,6 @@ rcu_torture_init(void) goto unwind; } } - rcutorture_record_test_transition(); torture_init_end(); return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3333ee2c6f5..65abb399b08d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -191,18 +191,6 @@ module_param(gp_cleanup_delay, int, 0444); */ #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ -/* - * Track the rcutorture test sequence number and the update version - * number within a given test. The rcutorture_testseq is incremented - * on every rcutorture module load and unload, so has an odd value - * when a test is running. The rcutorture_vernum is set to zero - * when rcutorture starts and is incremented on each rcutorture update. - * These variables enable correlating rcutorture output with the - * RCU tracing information. - */ -unsigned long rcutorture_testseq; -unsigned long rcutorture_vernum; - /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is @@ -622,20 +610,6 @@ void show_rcu_gp_kthreads(void) } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -/* - * Record the number of times rcutorture tests have been initiated and - * terminated. 
This information allows the debugfs tracing stats to be - * correlated to the rcutorture messages, even when the rcutorture module - * is being repeatedly loaded and unloaded. In other words, we cannot - * store this state in rcutorture itself. - */ -void rcutorture_record_test_transition(void) -{ - rcutorture_testseq++; - rcutorture_vernum = 0; -} -EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); - /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -664,17 +638,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -/* - * Record the number of writer passes through the current rcutorture test. - * This is also used to correlate debugfs tracing stats with the rcutorture - * messages. - */ -void rcutorture_record_progress(unsigned long vernum) -{ - rcutorture_vernum++; -} -EXPORT_SYMBOL_GPL(rcutorture_record_progress); - /* * Return the root node of the specified rcu_state structure. */ -- cgit v1.2.3 From 2d3625841dcee549653b6f50ffa4e6431305035a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 May 2018 11:09:47 -0700 Subject: rcuperf: Remove unused torturing_tasks() function The torturing_tasks() function in rcuperf.c is not used, so this commit removes it. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b080bc4a4f45..06eb5e42726d 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,11 +369,6 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) return cur_ops->gp_diff(new, old); } -static bool __maybe_unused torturing_tasks(void) -{ - return cur_ops == &tasks_ops; -} - /* * If performance tests complete, wait for shutdown to commence. */ -- cgit v1.2.3 From 6b06aa723ed705102f3c63a494ac45352ccc0e7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 10:56:05 -0700 Subject: rcutorture: Extract common code from rcu_torture_reader() This commit extracts the code executed on each pass through the loop in rcu_torture_reader() into a new rcu_torture_one_read() function. This new function will also be used by rcu_torture_timer(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 98 +++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 335387fabac2..971e31ae9bcf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1089,6 +1089,60 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. 
+ */ +static bool rcu_torture_one_read(struct torture_random_state *trsp) +{ + int idx; + unsigned long started; + unsigned long completed; + struct rcu_torture *p; + int pipe_count; + unsigned long long ts; + + idx = cur_ops->readlock(); + started = cur_ops->get_gp_seq(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + return false; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(trsp); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed = cur_ops->get_gp_seq(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, started, completed); + rcu_ftrace_dump(DUMP_ALL); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = rcutorture_seq_diff(completed, started); + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + return true; +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1165,14 +1219,8 @@ static void rcu_torture_timer(struct timer_list *unused) static int rcu_torture_reader(void *arg) { - unsigned long started; - unsigned long completed; - int idx; DEFINE_TORTURE_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; struct timer_list t; - unsigned long long ts; VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); @@ -1184,44 +1232,8 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - idx = cur_ops->readlock(); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + if (!rcu_torture_one_read(&rand)) schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->get_gp_seq(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { -- cgit v1.2.3 From 8da9a59523b6608f4b21f3e489578d0993c0779f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 22 May 2018 11:17:51 -0700 Subject: rcutorture: Use atomic increment for n_rcu_torture_timers Currently, rcu_torture_timer() relies on a lock to guard updates to n_rcu_torture_timers. Unfortunately, consolidating code with rcu_torture_reader() will dispense with this lock. This commit therefore makes n_rcu_torture_timers be an atomic_long_t and uses atomic_long_inc() to carry out the update. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e31ae9bcf..2452e4a29923 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -151,7 +151,7 @@ static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; +static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; static atomic_long_t n_cbfloods; @@ -1160,6 +1160,7 @@ static void rcu_torture_timer(struct timer_list *unused) int pipe_count; unsigned long long ts; + atomic_long_inc(&n_rcu_torture_timers); idx = cur_ops->readlock(); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -1177,7 +1178,6 @@ static void rcu_torture_timer(struct timer_list *unused) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); cur_ops->read_delay(&rand); - n_rcu_torture_timers++; spin_unlock(&rand_lock); preempt_disable(); pipe_count = p->rtort_pipe_count; @@ -1290,7 +1290,7 @@ rcu_torture_stats_print(void) pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers); + atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", n_barrier_successes, -- cgit v1.2.3 From 3025520ec424df8b0fd5cdc319ad6b83406d9954 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:38:47 -0700 Subject: rcutorture: Use per-CPU random state for rcu_torture_timer() Currently, the rcu_torture_timer() function uses a single global torture_random_state structure protected by a single global lock. This conflicts to some extent with performance and scalability, but even more with the goal of consolidating read-side testing with rcu_torture_reader(). This commit therefore creates a per-CPU torture_random_state structure for use by rcu_torture_timer() and eliminates the lock. Signed-off-by: Paul E. McKenney [ paulmck: Make rcu_torture_timer_rand static, per 0day Test Robot report. ] --- kernel/rcu/rcutorture.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2452e4a29923..d5a5465d2507 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1143,6 +1143,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) return true; } +static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. 
The @@ -1154,12 +1156,12 @@ static void rcu_torture_timer(struct timer_list *unused) int idx; unsigned long started; unsigned long completed; - static DEFINE_TORTURE_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + struct torture_random_state *trsp; unsigned long long ts; + trsp = this_cpu_ptr(&rcu_torture_timer_rand); atomic_long_inc(&n_rcu_torture_timers); idx = cur_ops->readlock(); started = cur_ops->get_gp_seq(); @@ -1176,9 +1178,7 @@ static void rcu_torture_timer(struct timer_list *unused) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - spin_unlock(&rand_lock); + cur_ops->read_delay(trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { -- cgit v1.2.3 From 241b42522abb36c78cdc84d0cade358c4449306f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2018 11:59:31 -0700 Subject: rcutorture: Make rcu_torture_timer() use rcu_torture_one_read() This commit saves a few lines of code by making rcu_torture_timer() invoke rcu_torture_one_read(), thus completing the consolidation of code between rcu_torture_timer() and rcu_torture_reader(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d5a5465d2507..ac700aa6dcaf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1153,53 +1153,8 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); */ static void rcu_torture_timer(struct timer_list *unused) { - int idx; - unsigned long started; - unsigned long completed; - struct rcu_torture *p; - int pipe_count; - struct torture_random_state *trsp; - unsigned long long ts; - - trsp = this_cpu_ptr(&rcu_torture_timer_rand); atomic_long_inc(&n_rcu_torture_timers); - idx = cur_ops->readlock(); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(trsp); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed = cur_ops->get_gp_seq(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - started, completed); - rcu_ftrace_dump(DUMP_ALL); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { -- cgit v1.2.3 From 2397d072f76b552fc21cda19686d24a8066ced22 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 25 May 2018 07:29:25 -0700 Subject: rcutorture: Handle extended read-side critical sections This commit enables rcutorture to test whether RCU properly aggregates different types of read-side critical sections into a larger section covering the set. It does this by extending an initial read-side critical section randomly for a random number of extensions. There is a new rcu_torture_ops field ->extendable that specifies what extensions are permitted for a given flavor of RCU (for example, SRCU does not permit any extensions, while RCU-sched permits all types). Note that if a given operation (for example, local_bh_disable()) extends an RCU read-side critical section, then rcutorture feels free to also start and end the critical section with that operation's type of disabling. Disabling operations include local_bh_disable(), local_irq_disable(), and preempt_disable(). This commit also adds a new "busted_srcud" torture type, which verifies rcutorture's ability to detect extensions of RCU read-side critical sections that are not handled. Gotta test the test, after all! Note that it is not legal to invoke local_bh_disable() with interrupts disabled, and this transition is avoided by overriding the random-number generator when it wants to call local_bh_disable() while interrupts are disabled. The code instead leaves both interrupts and bh/softirq disabled in this case. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ac700aa6dcaf..f97757755207 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -62,6 +62,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +/* Bits for ->extendables field, extendables param, and related definitions. */ +#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_BH 0x1 /* Extend readers by disabling bh. */ +#define RCUTORTURE_RDR_IRQ 0x2 /* ... disabling interrupts. */ +#define RCUTORTURE_RDR_PREEMPT 0x4 /* ... disabling preemption. */ +#define RCUTORTURE_RDR_RCU 0x8 /* ... entering another RCU reader. */ +#define RCUTORTURE_MAX_EXTEND (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | \ + RCUTORTURE_RDR_PREEMPT) +#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ + /* Must be power of two minus one. 
*/ + torture_param(int, cbflood_inter_holdoff, HZ, "Holdoff between floods (jiffies)"); torture_param(int, cbflood_intra_holdoff, 1, @@ -69,6 +81,8 @@ torture_param(int, cbflood_intra_holdoff, 1, torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); torture_param(int, cbflood_n_per_burst, 20000, "# callbacks per burst in flood"); +torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, + "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -277,6 +291,8 @@ struct rcu_torture_ops { void (*stats)(void); int irq_capable; int can_boost; + int extendables; + int ext_irq_conflict; const char *name; }; @@ -452,6 +468,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ), + .ext_irq_conflict = RCUTORTURE_RDR_RCU, .name = "rcu_bh" }; @@ -622,6 +640,26 @@ static struct rcu_torture_ops srcud_ops = { .name = "srcud" }; +/* As above, but broken due to inappropriate reader extension. */ +static struct rcu_torture_ops busted_srcud_ops = { + .ttype = SRCU_FLAVOR, + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .get_gp_seq = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, + .name = "busted_srcud" +}; + /* * Definitions for sched torture testing. */ @@ -660,6 +698,7 @@ static struct rcu_torture_ops sched_ops = { .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "sched" }; @@ -1089,6 +1128,110 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +/* + * Do one extension of an RCU read-side critical section using the + * current reader state in readstate (set to zero for initial entry + * to extended critical section), set the new state as specified by + * newstate (set to zero for final exit from extended critical section), + * and random-number-generator state in trsp. If this is neither the + * beginning or end of the critical section and if there was actually a + * change, do a ->read_delay(). + */ +static void rcutorture_one_extend(int *readstate, int newstate, + struct torture_random_state *trsp) +{ + int idxnew = -1; + int idxold = *readstate; + int statesnew = ~*readstate & newstate; + int statesold = *readstate & ~newstate; + + WARN_ON_ONCE(idxold < 0); + WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + + /* First, put new protection in place to avoid critical-section gap. */ + if (statesnew & RCUTORTURE_RDR_BH) + local_bh_disable(); + if (statesnew & RCUTORTURE_RDR_IRQ) + local_irq_disable(); + if (statesnew & RCUTORTURE_RDR_PREEMPT) + preempt_disable(); + if (statesnew & RCUTORTURE_RDR_RCU) + idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + + /* Next, remove old protection, irq first due to bh conflict. 
*/ + if (statesold & RCUTORTURE_RDR_IRQ) + local_irq_enable(); + if (statesold & RCUTORTURE_RDR_BH) + local_bh_enable(); + if (statesold & RCUTORTURE_RDR_PREEMPT) + preempt_enable(); + if (statesold & RCUTORTURE_RDR_RCU) + cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + + /* Delay if neither beginning nor end and there was a change. */ + if ((statesnew || statesold) && *readstate && newstate) + cur_ops->read_delay(trsp); + + /* Update the reader state. */ + if (idxnew == -1) + idxnew = idxold & ~RCUTORTURE_RDR_MASK; + WARN_ON_ONCE(idxnew < 0); + WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); + *readstate = idxnew | newstate; + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); + WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); +} + +/* Return the biggest extendables mask given current RCU and boot parameters. */ +static int rcutorture_extend_mask_max(void) +{ + int mask; + + WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); + mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; + mask = mask | RCUTORTURE_RDR_RCU; + return mask; +} + +/* Return a random protection state mask, but with at least one bit set. */ +static int +rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) +{ + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + mask = mask & (torture_random(trsp) >> RCUTORTURE_RDR_SHIFT); + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & RCUTORTURE_RDR_BH) && + (oldmask & RCUTORTURE_RDR_BH)) + mask |= RCUTORTURE_RDR_BH; /* Can't enable bh w/irq disabled. */ + if ((mask & RCUTORTURE_RDR_IRQ) && + !(mask & cur_ops->ext_irq_conflict) && + (oldmask & cur_ops->ext_irq_conflict)) + mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ + return mask ?: RCUTORTURE_RDR_RCU; +} + +/* + * Do a randomly selected number of extensions of an existing RCU read-side + * critical section. + */ +static void rcutorture_loop_extend(int *readstate, + struct torture_random_state *trsp) +{ + int i; + int mask = rcutorture_extend_mask_max(); + + WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */ + if (!((mask - 1) & mask)) + return; /* Current RCU flavor not extendable. */ + i = (torture_random(trsp) >> 3) & RCUTORTURE_RDR_MAX_LOOPS; + while (i--) { + mask = rcutorture_extend_mask(*readstate, trsp); + rcutorture_one_extend(readstate, mask, trsp); + } +} + /* * Do one read-side critical section, returning false if there was * no data to read. 
Can be invoked both from process context and @@ -1096,14 +1239,16 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { - int idx; unsigned long started; unsigned long completed; + int newstate; struct rcu_torture *p; int pipe_count; + int readstate = 0; unsigned long long ts; - idx = cur_ops->readlock(); + newstate = rcutorture_extend_mask(readstate, trsp); + rcutorture_one_extend(&readstate, newstate, trsp); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1113,12 +1258,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(trsp); + rcutorture_loop_extend(&readstate, trsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -1139,7 +1284,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - cur_ops->readunlock(idx); + rcutorture_one_extend(&readstate, 0, trsp); + WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); return true; } @@ -1704,7 +1850,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, &tasks_ops, + &busted_srcud_ops, &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose)) -- cgit v1.2.3 From bf1bef50bee13b2292929f4b86118302a3827a32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Jun 2018 08:50:09 -0700 Subject: rcutorture: Emphasize testing of single reader protection type For RCU implementations supporting multiple types of reader protection, rcutorture currently randomly selects the combinations of types of protection for each phase of each reader. The problem with this is that, for example, given the four kinds of protection for RCU-sched (local_irq_disable(), local_bh_disable(), preempt_disable(), and rcu_read_lock_sched()), the reader will be protected by a single mechanism only 25% of the time. We really need heavier testing of single read-side mechanisms. This commit therefore uses only a single mechanism about 60% of the time, half of the time explicitly and one-eighth of the time by chance. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f97757755207..aa0be7ec2a26 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -69,6 +69,7 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett > 8; + unsigned long randmask2 = randmask1 >> 1; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - mask = mask & (torture_random(trsp) >> RCUTORTURE_RDR_SHIFT); + /* Half the time lots of bits, half the time only one bit.
*/ + if (randmask1 & 0x1) + mask = mask & randmask2; + else + mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); if ((mask & RCUTORTURE_RDR_IRQ) && !(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) -- cgit v1.2.3 From 450efca7182a516a12dfcc0311abfd242bde42b2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 10 Jun 2018 16:45:43 -0700 Subject: rcutorture: Disable RT throttling for boost tests Currently rcutorture is not able to torture RCU boosting properly. This is because the rcutorture's boost threads which are doing the torturing may be throttled due to RT throttling. This patch makes rcutorture use the right torture technique (unthrottled rcutorture boost tasks) for torturing RCU so that the test fails correctly when no boost is available. Currently this requires accessing sysctl_sched_rt_runtime directly, but that should be Ok since rcutorture is test code. Such direct access is also only possible if rcutorture is used as a built-in so make it conditional on that. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aa0be7ec2a26..74e47d0a618c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,6 +55,7 @@ #include #include #include +#include #include "rcu.h" @@ -772,6 +773,32 @@ static void rcu_torture_boost_cb(struct rcu_head *head) smp_store_release(&rbip->inflight, 0); } +static int old_rt_runtime = -1; + +static void rcu_torture_disable_rt_throttle(void) +{ + /* + * Disable RT throttling so that rcutorture's boost threads don't get + * throttled. Only possible if rcutorture is built-in otherwise the + * user should manually do this by setting the sched_rt_period_us and + * sched_rt_runtime sysctls. + */ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime != -1) + return; + + old_rt_runtime = sysctl_sched_rt_runtime; + sysctl_sched_rt_runtime = -1; +} + +static void rcu_torture_enable_rt_throttle(void) +{ + if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime == -1) + return; + + sysctl_sched_rt_runtime = old_rt_runtime; + old_rt_runtime = -1; +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -1511,6 +1538,7 @@ static int rcutorture_booster_cleanup(unsigned int cpu) mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; + rcu_torture_enable_rt_throttle(); mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ @@ -1527,6 +1555,7 @@ static int rcutorture_booster_init(unsigned int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); + rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), -- cgit v1.2.3 From 3b745c8969c752601cb68c82a06735363563ab42 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 10 Jun 2018 16:45:44 -0700 Subject: rcutorture: Make boost test more robust Currently, with RCU_BOOST disabled, I get no failures when forcing rcutorture to test RCU boost priority inversion. The reason seems to be that we don't check for failures if the callback never ran at all for the duration of the boost-test loop. Further, the 'rtb' and 'rtbf' counters seem to be used inconsistently. 
'rtb' is incremented at the start of each test and 'rtbf' is incremented per-cpu on each failure of call_rcu. So its possible 'rtbf' > 'rtb'. To test the boost with rcutorture, I did following on a 4-CPU x86 machine: modprobe rcutorture test_boost=2 sleep 20 rmmod rcutorture With patch: rtbf: 8 rtb: 12 Without patch: rtbf: 0 rtb: 2 In summary this patch: - Increments failed and total test counters once per boost-test. - Checks for failure cases correctly. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74e47d0a618c..36b9b8266213 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -799,6 +799,18 @@ static void rcu_torture_enable_rt_throttle(void) old_rt_runtime = -1; } +static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) +{ + if (end - start > test_boost_duration * HZ - HZ / 2) { + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + + return true; /* failed */ + } + + return false; /* passed */ +} + static int rcu_torture_boost(void *arg) { unsigned long call_rcu_time; @@ -819,6 +831,21 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { + /* Track if the test failed already in this test interval? */ + bool failed = false; + + /* Increment n_rcu_torture_boosts once per boost-test */ + while (!kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + if (kthread_should_stop()) + goto checkwait; + /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { @@ -837,11 +864,10 @@ static int rcu_torture_boost(void *arg) /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } + /* Check if the boost test failed */ + failed = failed || + rcu_torture_boost_failed(call_rcu_time, + jiffies); call_rcu_time = jiffies; } stutter_wait("rcu_torture_boost"); @@ -849,6 +875,14 @@ static int rcu_torture_boost(void *arg) goto checkwait; } + /* + * If boost never happened, then inflight will always be 1, in + * this case the boost check would never happen in the above + * loop so do another one here. + */ + if (!failed && smp_load_acquire(&rbi.inflight)) + rcu_torture_boost_failed(call_rcu_time, jiffies); + /* * Set the start time of the next test interval. * Yes, this is vulnerable to long delays, but such @@ -861,7 +895,6 @@ static int rcu_torture_boost(void *arg) if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; - n_rcu_torture_boosts++; mutex_unlock(&boost_mutex); break; } -- cgit v1.2.3 From 622be33fcbc93e9b672b99ed338369eb5e843ac3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:47:34 +0200 Subject: rcutorture: Use monotonic timestamp for stall detection The get_seconds() call is deprecated because it overflows on 32-bit architectures. 
The algorithm in rcu_torture_stall() can deal with the overflow, but another problem here is that using a CLOCK_REALTIME stamp can lead to a false-positive stall warning when a settimeofday() happens concurrently. Using ktime_get_seconds() instead avoids those issues and will never overflow. The added cast to 'unsigned long' however is necessary to make ULONG_CMP_LT() work correctly. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 36b9b8266213..049b3735dba8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1622,7 +1622,7 @@ static int rcu_torture_stall(void *args) VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; + stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ rcu_read_lock(); if (stall_cpu_irqsoff) @@ -1631,7 +1631,8 @@ static int rcu_torture_stall(void *args) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", smp_processor_id()); - while (ULONG_CMP_LT(get_seconds(), stop_at)) + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), + stop_at)) continue; /* Induce RCU CPU stall warning. */ if (stall_cpu_irqsoff) local_irq_enable(); -- cgit v1.2.3 From 4babd855fd6137f9792117eb73b096c221a49d3c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:18 -0700 Subject: rcutorture: Add support to detect if boost kthread prio is too low When rcutorture is built in to the kernel, an earlier patch detects that and raises the priority of RCU's kthreads to allow rcutorture's RCU priority boosting tests to succeed. However, if rcutorture is built as a module, those priorities must be raised manually via the rcutree.kthread_prio kernel boot parameter. If this manual step is not taken, rcutorture's RCU priority boosting tests will fail due to kthread starvation. One approach would be to raise the default priority, but that risks breaking existing users. Another approach would be to allow runtime adjustment of RCU's kthread priorities, but that introduces numerous "interesting" race conditions. This patch therefore instead detects too-low priorities, and prints a message and disables the RCU priority boosting tests in that case. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/rcutorture.c | 32 ++++++++++++++++++++++++++++---- kernel/rcu/tree.c | 7 +++++++ 3 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0453a7d12b3f..bee070979970 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -502,6 +502,7 @@ static inline void rcu_force_quiescent_state(void) { } static inline void rcu_bh_force_quiescent_state(void) { } static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } +static inline int rcu_get_gp_kthreads_prio(void) { return 0; } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); unsigned long rcu_bh_get_gp_seq(void); @@ -510,6 +511,7 @@ unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); +int rcu_get_gp_kthreads_prio(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 049b3735dba8..e3d2d4f1d928 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1787,6 +1787,32 @@ static void rcu_torture_barrier_cleanup(void) } } +static bool rcu_torture_can_boost(void) +{ + static int boost_warn_once; + int prio; + + if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) + return false; + + prio = rcu_get_gp_kthreads_prio(); + if (!prio) + return false; + + if (prio < 2) { + if (boost_warn_once == 1) + return false; + + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. " + "Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 " + "on the kernel command line.\n", KBUILD_MODNAME); + boost_warn_once = 1; + return false; + } + + return true; +} + static enum cpuhp_state rcutor_hp; static void @@ -1831,8 +1857,7 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) + if (rcu_torture_can_boost()) cpuhp_remove_state(rcutor_hp); /* @@ -2056,8 +2081,7 @@ rcu_torture_init(void) test_boost_interval = 1; if (test_boost_duration < 2) test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { + if (rcu_torture_can_boost()) { boost_starttime = jiffies + test_boost_interval * HZ; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65abb399b08d..b4bcb5e21ca6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -180,6 +180,13 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* Retreive RCU kthreads priority for rcutorture */ +int rcu_get_gp_kthreads_prio(void) +{ + return kthread_prio; +} +EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); + /* * Number of grace periods between delays, normalized by the duration of * the delay. The longer the delay, the more the grace periods between -- cgit v1.2.3 From bf5b64355a3ce41752856b66c4efad4d7a88e84b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 19 Jun 2018 15:14:19 -0700 Subject: rcutorture: Fix rcu_barrier successes counter The rcutorture test module currently increments both successes and error for the barrier test upon error, which results in misleading statistics being printed. 
This commit therefore changes the code to increment the success counter only when the test actually passes. This change was tested by by returning from the barrier callback without incrementing the callback counter, thus introducing what appeared to rcutorture to be rcu_barrier() failures. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e3d2d4f1d928..bdc86cdf3b8b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -169,7 +169,7 @@ static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; -static long n_barrier_successes; +static long n_barrier_successes; /* did rcu_barrier test succeed? */ static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; @@ -1723,8 +1723,9 @@ static int rcu_torture_barrier(void *arg) atomic_read(&barrier_cbs_invoked), n_barrier_cbs); WARN_ON_ONCE(1); + } else { + n_barrier_successes++; } - n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_barrier"); @@ -1803,9 +1804,7 @@ static bool rcu_torture_can_boost(void) if (boost_warn_once == 1) return false; - pr_alert("%s: WARN: RCU kthread priority too low to test boosting. " - "Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 " - "on the kernel command line.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME); boost_warn_once = 1; return false; } -- cgit v1.2.3 From 8e1b706b6e819bed215c0db16345568864660393 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 13 Jul 2018 16:23:23 +0200 Subject: cpu/hotplug: Expose SMT control init function The L1TF mitigation will gain a commend line parameter which allows to set a combination of hypervisor mitigation and SMT control. Expose cpu_smt_disable() so the command line parser can tweak SMT settings. 
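A compilable userspace sketch of the behaviour being exposed follows; the enum, the state variable, and parse_smt_option() are simplified stand-ins invented for illustration, not the kernel's cpuhp implementation. It shows the property the helper preserves: once SMT has been force-disabled, a later non-forced request must not downgrade that state.

/*
 * Userspace sketch only -- names below mirror, but are not, the kernel API.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum smt_control { SMT_ENABLED, SMT_DISABLED, SMT_FORCE_DISABLED, SMT_NOT_SUPPORTED };

static enum smt_control smt_control = SMT_ENABLED;

static void cpu_smt_disable(bool force)
{
	if (smt_control == SMT_FORCE_DISABLED || smt_control == SMT_NOT_SUPPORTED)
		return;			/* never weaken an existing force-off state */
	smt_control = force ? SMT_FORCE_DISABLED : SMT_DISABLED;
}

/* Hypothetical mitigation parser reusing the exposed helper. */
static void parse_smt_option(const char *arg)
{
	cpu_smt_disable(arg && !strcmp(arg, "force"));
}

int main(void)
{
	parse_smt_option("force");	/* force SMT off */
	parse_smt_option("");		/* must not undo the forced state */
	printf("smt_control=%d (expect %d)\n", smt_control, SMT_FORCE_DISABLED);
	return 0;
}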
[ tglx: Split out of larger patch and made it preserve an already existing force off state ] Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina Reviewed-by: Greg Kroah-Hartman Reviewed-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20180713142323.039715135@linutronix.de --- kernel/cpu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d79e24df2420..8453e31f2d1a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -347,13 +347,23 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable); enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); -static int __init smt_cmdline_disable(char *str) +void __init cpu_smt_disable(bool force) { - cpu_smt_control = CPU_SMT_DISABLED; - if (str && !strcmp(str, "force")) { + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return; + + if (force) { pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; + } else { + cpu_smt_control = CPU_SMT_DISABLED; } +} + +static int __init smt_cmdline_disable(char *str) +{ + cpu_smt_disable(str && !strcmp(str, "force")); return 0; } early_param("nosmt", smt_cmdline_disable); -- cgit v1.2.3 From fee0aede6f4739c87179eca76136f83210953b86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Jul 2018 16:23:24 +0200 Subject: cpu/hotplug: Set CPU_SMT_NOT_SUPPORTED early The CPU_SMT_NOT_SUPPORTED state is set (if the processor does not support SMT) when the sysfs SMT control file is initialized. That was fine so far as this was only required to make the output of the control file correct and to prevent writes in that case. With the upcoming l1tf command line parameter, this needs to be set up before the L1TF mitigation selection and command line parsing happens. Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina Reviewed-by: Greg Kroah-Hartman Reviewed-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20180713142323.121795971@linutronix.de --- kernel/cpu.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8453e31f2d1a..37eec872042b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -361,6 +361,16 @@ void __init cpu_smt_disable(bool force) } } +/* + * The decision whether SMT is supported can only be done after the full + * CPU identification. Called from architecture code. + */ +void __init cpu_smt_check_topology(void) +{ + if (!topology_smt_supported()) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; +} + static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); @@ -2115,9 +2125,6 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_state_init(void) { - if (!topology_smt_supported()) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; - return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -- cgit v1.2.3 From be45bf5395e0886a93fc816bbe41a008ec2e42e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Jul 2018 12:42:08 +0200 Subject: watchdog/softlockup: Fix cpu_stop_queue_work() double-queue bug When scheduling is delayed for longer than the softlockup interrupt period it is possible to double-queue the cpu_stop_work, causing list corruption. Cure this by adding a completion to track the cpu_stop_work's progress. 
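The guard added by the patch below boils down to a small pattern: a per-CPU completion that starts out "done", is re-armed just before the stop-work is queued, and is completed by the work function. The compilable userspace sketch that follows uses a C11 atomic flag and invented names rather than struct completion and cpu_stop_work; it is an assumption-laden stand-in that only demonstrates why the timer can never queue a second instance while the first is still in flight.

/*
 * Userspace sketch of the "re-queue only after completion" guard; the
 * names and the atomic flag are illustrative, not the kernel's API.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool softlockup_done = true;	/* starts "completed" */

static void softlockup_fn(void)
{
	/* ... touch the watchdog timestamp ... */
	atomic_store(&softlockup_done, true);	/* complete() */
}

/* Called from the (possibly delayed) timer; must never double-queue. */
static bool queue_softlockup_work(void)
{
	bool expected = true;

	if (!atomic_compare_exchange_strong(&softlockup_done, &expected, false))
		return false;	/* previous work still in flight */
	/* stop_one_cpu_nowait(cpu, softlockup_fn, ...) would go here */
	return true;
}

int main(void)
{
	printf("queue #1: %d\n", queue_softlockup_work());	/* 1 */
	printf("queue #2: %d\n", queue_softlockup_work());	/* 0: still pending */
	softlockup_fn();					/* work completes */
	printf("queue #3: %d\n", queue_softlockup_work());	/* 1 again */
	return 0;
}

The kernel patch expresses the same idea with completion_done()/reinit_completion() in the timer path and complete() in softlockup_fn().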
Reported-by: kernel test robot Tested-by: Rong Chen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9cf57731b63e ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work") Link: http://lkml.kernel.org/r/20180713104208.GW2494@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b81f777838d5..5470dce212c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -330,6 +330,9 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } +static DEFINE_PER_CPU(struct completion, softlockup_completion); +static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); + /* * The watchdog thread function - touches the timestamp. * @@ -343,12 +346,11 @@ static int softlockup_fn(void *data) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); + complete(this_cpu_ptr(&softlockup_completion)); return 0; } -static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -364,9 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) watchdog_interrupt_count(); /* kick the softlockup detector */ - stop_one_cpu_nowait(smp_processor_id(), - softlockup_fn, NULL, - this_cpu_ptr(&softlockup_stop_work)); + if (completion_done(this_cpu_ptr(&softlockup_completion))) { + reinit_completion(this_cpu_ptr(&softlockup_completion)); + stop_one_cpu_nowait(smp_processor_id(), + softlockup_fn, NULL, + this_cpu_ptr(&softlockup_stop_work)); + } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -467,9 +472,13 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); + struct completion *done = this_cpu_ptr(&softlockup_completion); WARN_ON_ONCE(cpu != smp_processor_id()); + init_completion(done); + complete(done); + /* * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. @@ -499,6 +508,7 @@ static void watchdog_disable(unsigned int cpu) */ watchdog_nmi_disable(cpu); hrtimer_cancel(hrtimer); + wait_for_completion(this_cpu_ptr(&softlockup_completion)); } static int softlockup_stop_fn(void *data) -- cgit v1.2.3 From 8fe5c5a937d0f4e84221631833a2718afde52285 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 12 Jun 2018 12:22:15 +0100 Subject: sched/fair: Fix util_avg of new tasks for asymmetric systems When a new task wakes-up for the first time, its initial utilization is set to half of the spare capacity of its CPU. The current implementation of post_init_entity_util_avg() uses SCHED_CAPACITY_SCALE directly as a capacity reference. As a result, on a big.LITTLE system, a new task waking up on an idle little CPU will be given ~512 of util_avg, even if the CPU's capacity is significantly less than that. Fix this by computing the spare capacity with arch_scale_cpu_capacity(). 
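To make the arithmetic concrete, here is a compilable userspace sketch of just the capping formula; the LITTLE-CPU capacity of 446 is an invented example value, not something taken from the patch, and the two helpers merely restate the expression shown in the diff below.

/*
 * Userspace illustration of the capping formula only; 446 is an assumed
 * example capacity for a LITTLE CPU.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024L

/* Old: spare capacity measured against the fixed 1024 scale. */
static long cap_old(long cfs_util_avg)
{
	return (SCHED_CAPACITY_SCALE - cfs_util_avg) / 2;
}

/* New: spare capacity measured against the CPU's own capacity. */
static long cap_new(long cpu_scale, long cfs_util_avg)
{
	return (cpu_scale - cfs_util_avg) / 2;
}

int main(void)
{
	long little = 446;	/* assumed arch_scale_cpu_capacity() result */

	printf("idle LITTLE, old cap: %ld\n", cap_old(0));		/* 512 */
	printf("idle LITTLE, new cap: %ld\n", cap_new(little, 0));	/* 223 */
	return 0;
}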
Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Link: http://lkml.kernel.org/r/20180612112215.25448-1-quentin.perret@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 321cd5dcf2e8..08b89ae34233 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -735,11 +735,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * - * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * - * where n denotes the nth task. + * where n denotes the nth task and cpu_scale the CPU capacity. * - * For example, a simplest series from the beginning would be like: + * For example, for a CPU with 1024 of capacity, a simplest series from + * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... @@ -751,7 +752,8 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { -- cgit v1.2.3 From c079629862b20c101e8336362a8b042ec7d942fe Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:04 +0200 Subject: sched/pelt: Move PELT related code in a dedicated file We want to track rt_rq's utilization as a part of the estimation of the whole rq's utilization. This is necessary because rt tasks can steal utilization to cfs tasks and make them lighter than they are. As we want to use the same load tracking mecanism for both and prevent useless dependency between cfs and rt code, PELT code is moved in a dedicated file. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/fair.c | 333 +------------------------------------------------- kernel/sched/pelt.c | 311 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 43 +++++++ kernel/sched/sched.h | 19 +++ 5 files changed, 375 insertions(+), 333 deletions(-) create mode 100644 kernel/sched/pelt.c create mode 100644 kernel/sched/pelt.h (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index d9a02b318108..7fe183404c38 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08b89ae34233..39ab46cea6c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return container_of(cfs_rq, struct rq, cfs); } -#define entity_is_task(se) 1 #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP - +#include "pelt.h" #include "sched-pelt.h" static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -2751,19 +2747,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) #ifdef CONFIG_SMP -/* - * XXX we want to get rid of these helpers and use the full load resolution. 
- */ -static inline long se_weight(struct sched_entity *se) -{ - return scale_load_down(se->load.weight); -} - -static inline long se_runnable(struct sched_entity *se) -{ - return scale_load_down(se->runnable_weight); -} - static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3064,314 +3047,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } #ifdef CONFIG_SMP -/* - * Approximate: - * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) - */ -static u64 decay_load(u64 val, u64 n) -{ - unsigned int local_n; - - if (unlikely(n > LOAD_AVG_PERIOD * 63)) - return 0; - - /* after bounds checking we can collapse to 32-bit */ - local_n = n; - - /* - * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) - * With a look-up table which covers y^n (n= LOAD_AVG_PERIOD)) { - val >>= local_n / LOAD_AVG_PERIOD; - local_n %= LOAD_AVG_PERIOD; - } - - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); - return val; -} - -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) -{ - u32 c1, c2, c3 = d3; /* y^0 == 1 */ - - /* - * c1 = d1 y^p - */ - c1 = decay_load((u64)d1, periods); - - /* - * p-1 - * c2 = 1024 \Sum y^n - * n=1 - * - * inf inf - * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) - * n=0 n=p - */ - c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; - - return c1 + c2 + c3; -} - -/* - * Accumulate the three separate parts of the sum; d1 the remainder - * of the last (incomplete) period, d2 the span of full periods and d3 - * the remainder of the (incomplete) current period. - * - * d1 d2 d3 - * ^ ^ ^ - * | | | - * |<->|<----------------->|<--->| - * ... |---x---|------| ... |------|-----x (now) - * - * p-1 - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 - * n=1 - * - * = u y^p + (Step 1) - * - * p-1 - * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) - * n=1 - */ -static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - unsigned long scale_freq, scale_cpu; - u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ - u64 periods; - - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - delta += sa->period_contrib; - periods = delta / 1024; /* A period is 1024us (~1ms) */ - - /* - * Step 1: decay old *_sum if we crossed period boundaries. - */ - if (periods) { - sa->load_sum = decay_load(sa->load_sum, periods); - sa->runnable_load_sum = - decay_load(sa->runnable_load_sum, periods); - sa->util_sum = decay_load((u64)(sa->util_sum), periods); - - /* - * Step 2 - */ - delta %= 1024; - contrib = __accumulate_pelt_segments(periods, - 1024 - sa->period_contrib, delta); - } - sa->period_contrib = delta; - - contrib = cap_scale(contrib, scale_freq); - if (load) - sa->load_sum += load * contrib; - if (runnable) - sa->runnable_load_sum += runnable * contrib; - if (running) - sa->util_sum += contrib * scale_cpu; - - return periods; -} - -/* - * We can represent the historical contribution to runnable average as the - * coefficients of a geometric series. To do this we sub-divide our runnable - * history into segments of approximately 1ms (1024us); label the segment that - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. - * - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... - * p0 p1 p2 - * (now) (~1ms ago) (~2ms ago) - * - * Let u_i denote the fraction of p_i that the entity was runnable. 
- * - * We then designate the fractions u_i as our co-efficients, yielding the - * following representation of historical load: - * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... - * - * We choose y based on the with of a reasonably scheduling period, fixing: - * y^32 = 0.5 - * - * This means that the contribution to load ~32ms ago (u_32) will be weighted - * approximately half as much as the contribution to load within the last ms - * (u_0). - * - * When a period "rolls over" and we have new u_0`, multiplying the previous - * sum again by y is sufficient to update: - * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) - * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] - */ -static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long load, unsigned long runnable, int running) -{ - u64 delta; - - delta = now - sa->last_update_time; - /* - * This should only happen when time goes backwards, which it - * unfortunately does during sched clock init when we swap over to TSC. - */ - if ((s64)delta < 0) { - sa->last_update_time = now; - return 0; - } - - /* - * Use 1024ns as the unit of measurement since it's a reasonable - * approximation of 1us and fast to compute. - */ - delta >>= 10; - if (!delta) - return 0; - - sa->last_update_time += delta << 10; - - /* - * running is a subset of runnable (weight) so running can't be set if - * runnable is clear. But there are some corner cases where the current - * se has been already dequeued but cfs_rq->curr still points to it. - * This means that weight will be 0 but not running for a sched_entity - * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages() - */ - if (!load) - runnable = running = 0; - - /* - * Now we know we crossed measurement unit boundaries. The *_avg - * accrues by two steps: - * - * Step 1: accumulate *_sum since last_update_time. If we haven't - * crossed period boundaries, finish. - */ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) - return 0; - - return 1; -} - -static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) -{ - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; - - /* - * Step 2: update *_avg. - */ - sa->load_avg = div_u64(load * sa->load_sum, divider); - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); - sa->util_avg = sa->util_sum / divider; -} - -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). 
- */ -#define UTIL_AVG_UNCHANGED 0x1 - -static inline void cfs_se_util_change(struct sched_avg *avg) -{ - unsigned int enqueued; - - if (!sched_feat(UTIL_EST)) - return; - - /* Avoid store if the flag has been already set */ - enqueued = avg->util_est.enqueued; - if (!(enqueued & UTIL_AVG_UNCHANGED)) - return; - - /* Reset flag to report util_avg has been updated */ - enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); -} - -/* - * sched_entity: - * - * task: - * se_runnable() == se_weight() - * - * group: [ see update_cfs_group() ] - * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg - * - * load_sum := runnable_sum - * load_avg = se_weight(se) * runnable_avg - * - * runnable_load_sum := runnable_sum - * runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum - * - * cfq_rs: - * - * load_sum = \Sum se_weight(se) * se->avg.load_sum - * load_avg = \Sum se->avg.load_avg - * - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - * runnable_load_avg = \Sum se->avg.runable_load_avg - */ - -static int -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - return 1; - } - - return 0; -} - -static int -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (entity_is_task(se)) - se->runnable_weight = se->load.weight; - - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, - cfs_rq->curr == se)) { - - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); - cfs_se_util_change(&se->avg); - return 1; - } - - return 0; -} - -static int -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) -{ - if (___update_load_sum(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - scale_load_down(cfs_rq->runnable_weight), - cfs_rq->curr != NULL)) { - - ___update_load_avg(&cfs_rq->avg, 1, 1); - return 1; - } - - return 0; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -4039,12 +3714,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) #else /* CONFIG_SMP */ -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c new file mode 100644 index 000000000000..e6ecbb2b8698 --- /dev/null +++ b/kernel/sched/pelt.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Per Entity Load Tracking + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith + * + * Various enhancements by Dmitry Adamushko. 
+ * (C) 2007 Dmitry Adamushko + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Move PELT related code from fair.c into this pelt.c file + * Author: Vincent Guittot + */ + +#include +#include "sched.h" +#include "sched-pelt.h" +#include "pelt.h" + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static u64 decay_load(u64 val, u64 n) +{ + unsigned int local_n; + + if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; +} + +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; /* y^0 == 1 */ + + /* + * c1 = d1 y^p + */ + c1 = decay_load((u64)d1, periods); + + /* + * p-1 + * c2 = 1024 \Sum y^n + * n=1 + * + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p + */ + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^p + (Step 1) + * + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; + + scale_freq = arch_scale_freq_capacity(cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. 
+ * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + u64 delta; + + delta = now - sa->last_update_time; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_update_time = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + + sa->last_update_time += delta << 10; + + /* + * running is a subset of runnable (weight) so running can't be set if + * runnable is clear. But there are some corner cases where the current + * se has been already dequeued but cfs_rq->curr still points to it. + * This means that weight will be 0 but not running for a sched_entity + * but also for a cfs_rq if the latter becomes idle. As an example, + * this happens during idle_balance() which calls + * update_blocked_averages() + */ + if (!load) + runnable = running = 0; + + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + return 0; + + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + + /* + * Step 2: update *_avg. 
+ */ + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + sa->util_avg = sa->util_sum / divider; +} + +/* + * sched_entity: + * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * + * cfq_rq: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg + */ + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + return 1; + } + + return 0; +} + +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { + + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + cfs_se_util_change(&se->avg); + return 1; + } + + return 0; +} + +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); + return 1; + } + + return 0; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h new file mode 100644 index 000000000000..9cac73efd64a --- /dev/null +++ b/kernel/sched/pelt.h @@ -0,0 +1,43 @@ +#ifdef CONFIG_SMP + +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); + +/* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. + * This flag is used to synchronize util_avg updates with util_est updates. + * We map this information into the LSB bit of the utilization saved at + * dequeue time (i.e. util_est.dequeued). 
+ */ +#define UTIL_AVG_UNCHANGED 0x1 + +static inline void cfs_se_util_change(struct sched_avg *avg) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Avoid store if the flag has been already set */ + enqueued = avg->util_est.enqueued; + if (!(enqueued & UTIL_AVG_UNCHANGED)) + return; + + /* Reset flag to report util_avg has been updated */ + enqueued &= ~UTIL_AVG_UNCHANGED; + WRITE_ONCE(avg->util_est.enqueued, enqueued); +} + +#else + +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif + + diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7742dcc136c..00d6f2594c4e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -673,7 +673,26 @@ struct dl_rq { u64 bw_ratio; }; +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + #ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} static inline bool sched_asym_prefer(int a, int b) { -- cgit v1.2.3 From 371bf42732694d142b0de026e152266c039b97d3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:05 +0200 Subject: sched/rt: Add rt_rq utilization tracking schedutil governor relies on cfs_rq's util_avg to choose the OPP when CFS tasks are running. When the CPU is overloaded by CFS and RT tasks, CFS tasks are preempted by RT tasks and in this case util_avg reflects the remaining capacity but not what CFS want to use. In such case, schedutil can select a lower OPP whereas the CPU is overloaded. In order to have a more accurate view of the utilization of the CPU, we track the utilization of RT tasks. Only util_avg is correctly tracked but not load_avg and runnable_load_avg which are useless for rt_rq. rt_rq uses rq_clock_task and cfs_rq uses cfs_rq_clock_task but they are the same at the root group level, so the PELT windows of the util_sum are aligned. 
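In practice the signal is driven from a handful of hook points (a condensed view of the rt.c and fair.c hunks below; running is 1 while an RT task owns the CPU and 0 for a decay-only update):

	/* put_prev_task_rt() / task_tick_rt(): account RT running time */
	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);

	/* pick_next_task_rt(): when switching to RT from another class,
	 * decay the signal up to now without accounting running time */
	if (rq->curr->sched_class != &rt_sched_class)
		update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);

update_blocked_averages() also calls it with running == 0, so that blocked RT utilization keeps decaying; rt_rq_has_blocked() then reports whether there is anything left to decay.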
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++++++- kernel/sched/pelt.c | 25 +++++++++++++++++++++++++ kernel/sched/pelt.h | 7 +++++++ kernel/sched/rt.c | 13 +++++++++++++ kernel/sched/sched.h | 7 +++++++ 5 files changed, 66 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 39ab46cea6c5..5b453213cd18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7290,6 +7290,14 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } +static inline bool rt_rq_has_blocked(struct rq *rq) +{ + if (READ_ONCE(rq->avg_rt.util_avg)) + return true; + + return false; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7349,6 +7357,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + /* Don't need periodic decay once load/util_avg are null */ + if (rt_rq_has_blocked(rq)) + done = false; #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7414,9 +7426,10 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !rt_rq_has_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index e6ecbb2b8698..a00b1ba3dd5b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -309,3 +309,28 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) return 0; } + +/* + * rt_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + * load_avg and runnable_load_avg are not supported and meaningless. 
+ * + */ + +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_rt, 1, 1); + return 1; + } + + return 0; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9cac73efd64a..b2983b741d57 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -3,6 +3,7 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); /* * When a task is dequeued, its estimated utilization should not be update if @@ -38,6 +39,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) return 0; } +static inline int +update_rt_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} + #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 572567078b60..0dc8ad1915e6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,6 +5,8 @@ */ #include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -1576,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) rt_queue_push_tasks(rq); + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1583,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + /* * The previous task needs to be made eligible for pushing * if it is still active @@ -2312,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 00d6f2594c4e..405dd9ba6b39 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,7 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; + #endif /* CONFIG_SMP */ int rt_queued; @@ -854,6 +855,7 @@ struct rq { u64 rt_avg; u64 age_stamp; + struct sched_avg avg_rt; u64 idle_stamp; u64 avg_idle; @@ -2212,4 +2214,9 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) return util; } + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return rq->avg_rt.util_avg; +} #endif -- cgit v1.2.3 From 3ae117c6cd7c4783819a0766aa97b9493a8a0f62 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:06 +0200 Subject: cpufreq/schedutil: Use RT utilization tracking Add both CFS and RT utilization when selecting an OPP for CFS tasks as RT can preempt and steal CFS's running time. RT util_avg is used to take into account the utilization of RT tasks on the CPU when selecting OPP. If a RT task migrate, the RT utilization will not migrate but will decay over time. On an overloaded CPU, CFS utilization reflects the remaining utilization avialable on CPU. When RT task migrates, the CFS utilization will increase when tasks will start to use the newly available capacity. 
At the same pace, RT utilization will decay and both variations will compensate each other to keep unchanged overall utilization and will prevent any OPP drop. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c907fde01eaa..da29b5a33adb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -56,6 +56,7 @@ struct sugov_cpu { /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; unsigned long util_dl; + unsigned long util_rt; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -186,15 +187,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); sg_cpu->util_cfs = cpu_util_cfs(rq); sg_cpu->util_dl = cpu_util_dl(rq); + sg_cpu->util_rt = cpu_util_rt(rq); } static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util; if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; + util = sg_cpu->util_dl; + util += sg_cpu->util_cfs; + util += sg_cpu->util_rt; + /* * Utilization required by DEADLINE must always be granted while, for * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to @@ -205,7 +212,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); + return min(sg_cpu->max, util); } /** -- cgit v1.2.3 From 3727e0e16340cbdf83818f5bf0113505c6876057 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:07 +0200 Subject: sched/dl: Add dl_rq utilization tracking Similarly to what happens with RT tasks, CFS tasks can be preempted by DL tasks and the CFS's utilization might no longer describes the real utilization level. Current DL bandwidth reflects the requirements to meet deadline when tasks are enqueued but not the current utilization of the DL sched class. We track DL class utilization to estimate the system utilization. 
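The wiring mirrors the rt_rq case: only util_avg is meaningful for the dl_rq, and the deadline.c call sites added below feed rq_clock_task() into the same generic PELT accumulator. Condensed:

	/* put_prev_task_dl() / task_tick_dl(): a DL task has been running */
	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);

	/* pick_next_task_dl(): switching from another class, decay only */
	if (rq->curr->sched_class != &dl_sched_class)
		update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);

update_blocked_averages() gets the same decay-only call, and others_rqs_have_blocked() now checks rq->avg_dl.util_avg as well.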
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 6 ++++++ kernel/sched/fair.c | 11 ++++++++--- kernel/sched/pelt.c | 23 +++++++++++++++++++++++ kernel/sched/pelt.h | 6 ++++++ kernel/sched/sched.h | 1 + 5 files changed, 44 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fbfc3f1d368a..f4de26982d80 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,7 @@ * Fabio Checconi */ #include "sched.h" +#include "pelt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1761,6 +1762,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + return p; } @@ -1768,6 +1772,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1784,6 +1789,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. 
In that case NEED_RESCHED will diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b453213cd18..f096275c7df2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7290,11 +7290,14 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } -static inline bool rt_rq_has_blocked(struct rq *rq) +static inline bool others_rqs_have_blocked(struct rq *rq) { if (READ_ONCE(rq->avg_rt.util_avg)) return true; + if (READ_ONCE(rq->avg_dl.util_avg)) + return true; + return false; } @@ -7358,8 +7361,9 @@ static void update_blocked_averages(int cpu) done = false; } update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); /* Don't need periodic decay once load/util_avg are null */ - if (rt_rq_has_blocked(rq)) + if (others_rqs_have_blocked(rq)) done = false; #ifdef CONFIG_NO_HZ_COMMON @@ -7427,9 +7431,10 @@ static inline void update_blocked_averages(int cpu) update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !rt_rq_has_blocked(rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !others_rqs_have_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a00b1ba3dd5b..8b78b6320cda 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -334,3 +334,26 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } + +/* + * dl_rq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + * + */ + +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_dl, 1, 1); + return 1; + } + + return 0; +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index b2983b741d57..0e4f912461ad 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -4,6 +4,7 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); +int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); /* * When a task is dequeued, its estimated utilization should not be update if @@ -45,6 +46,11 @@ update_rt_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +static inline int +update_dl_rq_load_avg(u64 now, struct rq *rq, int running) +{ + return 0; +} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 405dd9ba6b39..ab8b5296b5f6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -856,6 +856,7 @@ struct rq { u64 rt_avg; u64 age_stamp; struct sched_avg avg_rt; + struct sched_avg avg_dl; u64 idle_stamp; u64 avg_idle; -- cgit v1.2.3 From 8cc90515a4fa419ccfc4703ff127699cdcb96839 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:08 +0200 Subject: cpufreq/schedutil: Use DL utilization tracking Now that we have both the DL class bandwidth requirement and the DL class utilization, we can detect when CPU is fully used so we should run at max. 
Otherwise, we keep using the DL bandwidth requirement to define the utilization of the CPU. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 23 +++++++++++++++++------ kernel/sched/sched.h | 7 ++++++- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index da29b5a33adb..07760bc7f69a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -56,6 +56,7 @@ struct sugov_cpu { /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; unsigned long util_dl; + unsigned long bw_dl; unsigned long util_rt; unsigned long max; @@ -187,6 +188,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); sg_cpu->util_cfs = cpu_util_cfs(rq); sg_cpu->util_dl = cpu_util_dl(rq); + sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util_rt = cpu_util_rt(rq); } @@ -198,20 +200,29 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; - util = sg_cpu->util_dl; - util += sg_cpu->util_cfs; + util = sg_cpu->util_cfs; util += sg_cpu->util_rt; + if ((util + sg_cpu->util_dl) >= sg_cpu->max) + return sg_cpu->max; + /* - * Utilization required by DEADLINE must always be granted while, for - * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to - * gracefully reduce the frequency when no tasks show up for longer + * As there is still idle time on the CPU, we need to compute the + * utilization level of the CPU. + * + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. 
*/ + + /* Add DL bandwidth requirement */ + util += sg_cpu->bw_dl; + return min(sg_cpu->max, util); } @@ -367,7 +378,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) { - if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) sg_policy->need_freq_update = true; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab8b5296b5f6..9028f268f867 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2199,11 +2199,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -static inline unsigned long cpu_util_dl(struct rq *rq) +static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; } +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + static inline unsigned long cpu_util_cfs(struct rq *rq) { unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); -- cgit v1.2.3 From 91c27493e78df6849baaa21a9d66e26de8b875c0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:09 +0200 Subject: sched/irq: Add IRQ utilization tracking interrupt and steal time are the only remaining activities tracked by rt_avg. Like for sched classes, we can use PELT to track their average utilization of the CPU. But unlike sched class, we don't track when entering/leaving interrupt; Instead, we take into account the time spent under interrupt context when we update rqs' clock (rq_clock_task). This also means that we have to decay the normal context time and account for interrupt time during the update. That's also important to note that because: rq_clock == rq_clock_task + interrupt time and rq_clock_task is used by a sched class to compute its utilization, the util_avg of a sched class only reflects the utilization of the time spent in normal context and not of the whole time of the CPU. The utilization of interrupt gives an more accurate level of utilization of CPU. The CPU utilization is: avg_irq + (1 - avg_irq / max capacity) * /Sum avg_rq Most of the time, avg_irq is small and neglictible so the use of the approximation CPU utilization = /Sum avg_rq was enough. 
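To get a feel for the formula (the numbers are invented for illustration): with a CPU capacity of 1024, avg_irq = 102 (about 10% of the time spent in interrupt context) and a summed rq utilization of 500 measured on the task clock, the combined estimate is:

	CPU utilization = avg_irq + (1 - avg_irq / max capacity) * \Sum avg_rq
	                = 102 + (1 - 102/1024) * 500
	               ~= 102 + 0.9 * 500
	               ~= 552

i.e. the rq utilization, which only sees the task-clock window, is first scaled back to the wall-clock window before the interrupt time is added on top.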
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- kernel/sched/fair.c | 13 ++++++++++--- kernel/sched/pelt.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 16 ++++++++++++++++ kernel/sched/sched.h | 3 +++ 5 files changed, 72 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..38107a95baca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -17,6 +17,8 @@ #include "../workqueue_internal.h" #include "../smpboot.h" +#include "pelt.h" + #define CREATE_TRACE_POINTS #include @@ -185,7 +187,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); + update_irq_load_avg(rq, irq_delta + steal); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f096275c7df2..c2782b29c79f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7290,7 +7290,7 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) return false; } -static inline bool others_rqs_have_blocked(struct rq *rq) +static inline bool others_have_blocked(struct rq *rq) { if (READ_ONCE(rq->avg_rt.util_avg)) return true; @@ -7298,6 +7298,11 @@ static inline bool others_rqs_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if (READ_ONCE(rq->avg_irq.util_avg)) + return true; +#endif + return false; } @@ -7362,8 +7367,9 @@ static void update_blocked_averages(int cpu) } update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ - if (others_rqs_have_blocked(rq)) + if (others_have_blocked(rq)) done = false; #ifdef CONFIG_NO_HZ_COMMON @@ -7432,9 +7438,10 @@ static inline void update_blocked_averages(int cpu) update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_rqs_have_blocked(rq)) + if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) rq->has_blocked_load = 0; #endif rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 8b78b6320cda..ead6d8b4a8b8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -357,3 +357,43 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +/* + * irq: + * + * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked + * util_sum = cpu_scale * load_sum + * runnable_load_sum = load_sum + 
* + */ + +int update_irq_load_avg(struct rq *rq, u64 running) +{ + int ret = 0; + /* + * We know the time that has been used by interrupt since last update + * but we don't when. Let be pessimistic and assume that interrupt has + * happened just before the update. This is not so far from reality + * because interrupt will most probably wake up task and trig an update + * of rq clock during which the metric si updated. + * We start to decay with normal context time and then we add the + * interrupt context time. + * We can safely remove running from rq->clock because + * rq->clock += delta with delta >= running + */ + ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + 0, + 0, + 0); + ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + 1, + 1, + 1); + + if (ret) + ___update_load_avg(&rq->avg_irq, 1, 1); + + return ret; +} +#endif diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 0e4f912461ad..d2894db28955 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,6 +6,16 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +int update_irq_load_avg(struct rq *rq, u64 running); +#else +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} +#endif + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. @@ -51,6 +61,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { return 0; } + +static inline int +update_irq_load_avg(struct rq *rq, u64 running) +{ + return 0; +} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9028f268f867..b26d0c9948dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -857,6 +857,9 @@ struct rq { u64 age_stamp; struct sched_avg avg_rt; struct sched_avg avg_dl; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + struct sched_avg avg_irq; +#endif u64 idle_stamp; u64 avg_idle; -- cgit v1.2.3 From 9033ea11889f88f243445495f72441e22256d5e9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:10 +0200 Subject: cpufreq/schedutil: Take time spent in interrupts into account The time spent executing IRQ handlers can be significant but it is not reflected in the utilization of CPU when deciding to choose an OPP. Now that we have access to this metric, schedutil can take it into account when selecting the OPP for a CPU. RQS utilization don't see the time spend under interrupt context and report their value in the normal context time window. We need to compensate this when adding interrupt utilization The CPU utilization is: IRQ util_avg + (1 - IRQ util_avg / max capacity ) * /Sum rq util_avg A test with iperf on hikey (octo arm64) gives the following speedup: iperf -c server_address -r -t 5 w/o patch w/ patch Tx 276 Mbits/sec 304 Mbits/sec +10% Rx 299 Mbits/sec 328 Mbits/sec +9% 8 iterations stdev is lower than 1% Only WFI idle state is enabled (shallowest idle state). 
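Putting the pieces of sugov_aggregate_util() together (a condensed view of the resulting code in the hunk below; the early returns for a runnable RT class and for an already saturated CPU are left out):

	util  = util_cfs + util_rt;	/* RQS (task-clock) utilization */
	util *= (max - util_irq);	/* weight to the normal context window */
	util /= max;
	util += util_irq;		/* time actually spent in interrupts */
	util += bw_dl;			/* DL bandwidth requirement */
	return min(max, util);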
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 25 +++++++++++++++++++++---- kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 07760bc7f69a..7016bde9d194 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -58,6 +58,7 @@ struct sugov_cpu { unsigned long util_dl; unsigned long bw_dl; unsigned long util_rt; + unsigned long util_irq; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -190,21 +191,30 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->util_dl = cpu_util_dl(rq); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util_rt = cpu_util_rt(rq); + sg_cpu->util_irq = cpu_util_irq(rq); } static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util; + unsigned long util, max = sg_cpu->max; if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; + if (unlikely(sg_cpu->util_irq >= max)) + return max; + + /* Sum rq utilization */ util = sg_cpu->util_cfs; util += sg_cpu->util_rt; - if ((util + sg_cpu->util_dl) >= sg_cpu->max) - return sg_cpu->max; + /* + * Interrupt time is not seen by RQS utilization so we can compare + * them with the CPU capacity + */ + if ((util + sg_cpu->util_dl) >= max) + return max; /* * As there is still idle time on the CPU, we need to compute the @@ -220,10 +230,17 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) * ready for such an interface. So, we only do the latter for now. */ + /* Weight RQS utilization to normal context window */ + util *= (max - sg_cpu->util_irq); + util /= max; + + /* Add interrupt utilization */ + util += sg_cpu->util_irq; + /* Add DL bandwidth requirement */ util += sg_cpu->bw_dl; - return min(sg_cpu->max, util); + return min(max, util); } /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b26d0c9948dd..b2833e2b4b6a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2228,4 +2228,17 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return rq->avg_rt.util_avg; } + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +#endif #endif -- cgit v1.2.3 From dfa444dc2ff62edbaf1ff95ed22dd2ce8a5715da Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:11 +0200 Subject: sched/cpufreq: Remove sugov_aggregate_util() There is no reason why sugov_get_util() and sugov_aggregate_util() were in fact separate functions. Signed-off-by: Vincent Guittot [ Rebased after adding irq tracking and fixed some compilation errors. 
] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/1530200714-4504-9-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 44 ++++++++++++++-------------------------- kernel/sched/sched.h | 2 +- 2 files changed, 16 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7016bde9d194..c9622b3f183d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,12 +53,7 @@ struct sugov_cpu { unsigned int iowait_boost_max; u64 last_update; - /* The fields below are only needed when sharing a policy: */ - unsigned long util_cfs; - unsigned long util_dl; unsigned long bw_dl; - unsigned long util_rt; - unsigned long util_irq; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -182,38 +177,31 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util, irq, max; - sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->util_cfs = cpu_util_cfs(rq); - sg_cpu->util_dl = cpu_util_dl(rq); - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util_rt = cpu_util_rt(rq); - sg_cpu->util_irq = cpu_util_irq(rq); -} - -static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) -{ - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util, max = sg_cpu->max; + sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); if (rt_rq_is_runnable(&rq->rt)) - return sg_cpu->max; + return max; + + irq = cpu_util_irq(rq); - if (unlikely(sg_cpu->util_irq >= max)) + if (unlikely(irq >= max)) return max; /* Sum rq utilization */ - util = sg_cpu->util_cfs; - util += sg_cpu->util_rt; + util = cpu_util_cfs(rq); + util += cpu_util_rt(rq); /* * Interrupt time is not seen by RQS utilization so we can compare * them with the CPU capacity */ - if ((util + sg_cpu->util_dl) >= max) + if ((util + cpu_util_dl(rq)) >= max) return max; /* @@ -231,11 +219,11 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) */ /* Weight RQS utilization to normal context window */ - util *= (max - sg_cpu->util_irq); + util *= (max - irq); util /= max; /* Add interrupt utilization */ - util += sg_cpu->util_irq; + util += irq; /* Add DL bandwidth requirement */ util += sg_cpu->bw_dl; @@ -418,9 +406,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - sugov_get_util(sg_cpu); + util = sugov_get_util(sg_cpu); max = sg_cpu->max; - util = sugov_aggregate_util(sg_cpu); sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -459,9 +446,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - sugov_get_util(j_sg_cpu); + j_util = sugov_get_util(j_sg_cpu); j_max 
= j_sg_cpu->max; - j_util = sugov_aggregate_util(j_sg_cpu); sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); if (j_util * max > j_max * util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2833e2b4b6a..061d51fb5b44 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2226,7 +2226,7 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) static inline unsigned long cpu_util_rt(struct rq *rq) { - return rq->avg_rt.util_avg; + return READ_ONCE(rq->avg_rt.util_avg); } #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) -- cgit v1.2.3 From 523e979d31648112bad07f427c183525c0258c75 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:12 +0200 Subject: sched/core: Use PELT for scale_rt_capacity() The utilization of the CPU by RT, DL and IRQs are now tracked with PELT so we can use these metrics instead of rt_avg to evaluate the remaining capacity available for CFS class. scale_rt_capacity() behavior has been changed and now returns the remaining capacity available for CFS instead of a scaling factor because RT, DL and IRQ provide now absolute utilization value. The same formula as schedutil is used: IRQ util_avg + (1 - IRQ util_avg / max capacity ) * /Sum rq util_avg but the implementation is different because it doesn't return the same value and doesn't benefit of the same optimization. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-10-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- kernel/sched/fair.c | 44 ++++++++++++++++++++++---------------------- kernel/sched/pelt.c | 2 +- kernel/sched/rt.c | 2 -- 4 files changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f4de26982d80..68b8a9f1c9ca 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1180,8 +1180,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (dl_entity_is_special(dl_se)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c2782b29c79f..d265fa9756a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7551,39 +7551,39 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, used, age_stamp, avg; - s64 delta; - - /* - * Since we're reading these variables without serialization make sure - * we read them once before doing sanity checks on them. 
- */ - age_stamp = READ_ONCE(rq->age_stamp); - avg = READ_ONCE(rq->rt_avg); - delta = __rq_clock_broken(rq) - age_stamp; + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long used, free; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + unsigned long irq; +#endif - if (unlikely(delta < 0)) - delta = 0; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + irq = READ_ONCE(rq->avg_irq.util_avg); - total = sched_avg_period() + delta; + if (unlikely(irq >= max)) + return 1; +#endif - used = div_u64(avg, total); + used = READ_ONCE(rq->avg_rt.util_avg); + used += READ_ONCE(rq->avg_dl.util_avg); - if (likely(used < SCHED_CAPACITY_SCALE)) - return SCHED_CAPACITY_SCALE - used; + if (unlikely(used >= max)) + return 1; - return 1; + free = max - used; +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + free *= (max - irq); + free /= max; +#endif + return free; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = capacity; - - capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index ead6d8b4a8b8..35475c0c5419 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -237,7 +237,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna */ sa->load_avg = div_u64(load * sa->load_sum, divider); sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); - sa->util_avg = sa->util_sum / divider; + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0dc8ad1915e6..2df72abfa24a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -973,8 +973,6 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - if (!rt_bandwidth_enabled()) return; -- cgit v1.2.3 From bbb62c0b024a1c721232667fa1d625cf6b3a555b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:13 +0200 Subject: sched/core: Remove the rt_avg code rt_avg is not used anywhere anymore, so we can remove all related code. 
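For reference, the remaining-capacity arithmetic that replaces rt_avg (introduced by the preceding scale_rt_capacity() patch) can be sketched as ordinary user-space C. The inputs below are illustrative values on the SCHED_CAPACITY_SCALE (1024) scale, not data from a running system:

#include <stdio.h>

#define SCALE 1024UL

/* Capacity left for CFS: (max - rt - dl), scaled by the non-IRQ fraction. */
static unsigned long remaining_cfs_capacity(unsigned long max,
					    unsigned long util_rt,
					    unsigned long util_dl,
					    unsigned long util_irq)
{
	unsigned long used = util_rt + util_dl;
	unsigned long free;

	if (util_irq >= max || used >= max)
		return 1;		/* saturated; the patch returns 1 here too */

	free = max - used;
	free *= (max - util_irq);	/* free * (1 - irq/max), in integer math */
	free /= max;
	return free;
}

int main(void)
{
	/* e.g. RT ~10%, DL ~5%, IRQ ~2% of a 1024-capacity CPU */
	printf("capacity left for CFS: %lu\n",
	       remaining_cfs_capacity(SCALE, 102, 51, 20));
	return 0;
}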
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 -------------------------- kernel/sched/fair.c | 2 -- kernel/sched/sched.h | 17 ----------------- 3 files changed, 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 38107a95baca..a691b07390ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -651,23 +651,6 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ - -void sched_avg_update(struct rq *rq) -{ - s64 period = sched_avg_period(); - - while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (rq->age_stamp)); - rq->age_stamp += period; - rq->rt_avg /= 2; - } -} - #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -5716,13 +5699,6 @@ void set_rq_offline(struct rq *rq) } } -static void set_cpu_rq_start_time(unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - rq->age_stamp = sched_clock_cpu(cpu); -} - /* * used to mark begin/end of suspend/resume: */ @@ -5840,7 +5816,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { - set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -6108,7 +6083,6 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d265fa9756a2..d5f7d521e448 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5323,8 +5323,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } - - sched_avg_update(this_rq); } /* Used instead of source_load when we know the type == 0 */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 061d51fb5b44..14aac2d2de80 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -853,8 +853,6 @@ struct rq { struct list_head cfs_tasks; - u64 rt_avg; - u64 age_stamp; struct sched_avg avg_rt; struct sched_avg avg_dl; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) @@ -1719,11 +1717,6 @@ extern const_debug unsigned int sysctl_sched_time_avg; extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; -static inline u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - #ifdef CONFIG_SCHED_HRTICK /* @@ -1760,8 +1753,6 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1772,12 +1763,6 @@ unsigned 
long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif - -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); - sched_avg_update(rq); -} #else #ifndef arch_scale_cpu_capacity static __always_inline @@ -1786,8 +1771,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) return SCHED_CAPACITY_SCALE; } #endif -static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } -static inline void sched_avg_update(struct rq *rq) { } #endif struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -- cgit v1.2.3 From 5fd778915ad29184a5ff8eb82d1118f6916b79e4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 28 Jun 2018 17:45:14 +0200 Subject: sched/sysctl: Remove unused sched_time_avg_ms sysctl /proc/sys/kernel/sched_time_avg_ms entry is not used anywhere, remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luis R. Rodriguez Cc: Kees Cook Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 -------- kernel/sched/sched.h | 1 - kernel/sysctl.c | 8 -------- 3 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a691b07390ab..ba6bb805693a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -46,14 +46,6 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; -/* - * period over which we average the RT time consumption, measured - * in ms. - * - * default: 1s - */ -const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; - /* * period over which we measure -rt task CPU usage in us. * default: 1s diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14aac2d2de80..ebb4b3c3ece7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1713,7 +1713,6 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); -extern const_debug unsigned int sysctl_sched_time_avg; extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d9837c0aff4..f22f76b7a138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_time_avg_ms", - .data = &sysctl_sched_time_avg, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit v1.2.3 From 45f5519ec55e75af3565dd737586d3b041834f71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jul 2018 14:36:17 +0200 Subject: sched/cpufreq: Clarify sugov_get_util() Add a few comments to (hopefully) clarifying some of the magic in sugov_get_util(). 
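As a companion to those comments, the effective-utilization formula can be sketched in plain user-space C. The RT-runnable shortcut (returning max outright) is omitted, and all inputs are made-up values on a 0..1024 capacity scale:

#include <stdio.h>

static unsigned long effective_util(unsigned long max, unsigned long cfs,
				    unsigned long rt, unsigned long dl_util,
				    unsigned long dl_bw, unsigned long irq)
{
	unsigned long util;

	if (irq >= max)
		return max;		/* IRQ/steal time saturates the CPU */

	util = cfs + rt;		/* same PELT metric, directly additive */
	if (util + dl_util >= max)
		return max;		/* no idle time left: ask for f_max */

	/* U' = irq + (1 - irq/max) * U, then add the DL bandwidth floor */
	util = (util * (max - irq)) / max;
	util += irq;
	util += dl_bw;

	return util < max ? util : max;
}

int main(void)
{
	printf("effective util = %lu / 1024\n",
	       effective_util(1024, 300, 100, 50, 80, 40));
	return 0;
}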
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Link: http://lkml.kernel.org/r/20180705123617.GM2458@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 75 +++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c9622b3f183d..97dcd4472a0e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -177,6 +177,26 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); @@ -188,47 +208,60 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) if (rt_rq_is_runnable(&rq->rt)) return max; + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) return max; - /* Sum rq utilization */ + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ util = cpu_util_cfs(rq); util += cpu_util_rt(rq); /* - * Interrupt time is not seen by RQS utilization so we can compare - * them with the CPU capacity + * We do not make cpu_util_dl() a permanent part of this sum because we + * want to use cpu_bw_dl() later on, but we need to check if the + * CFS+RT+DL sum is saturated (ie. no idle time) such that we select + * f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. */ if ((util + cpu_util_dl(rq)) >= max) return max; /* - * As there is still idle time on the CPU, we need to compute the - * utilization level of the CPU. + * There is still idle time; further improve the number by using the + * irq metric. 
Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: * + * 1 - irq + * U' = irq + ------- * U + * max + */ + util *= (max - irq); + util /= max; + util += irq; + + /* * Bandwidth required by DEADLINE must always be granted while, for * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * - * Ideally we would like to set util_dl as min/guaranteed freq and - * util_cfs + util_dl as requested freq. However, cpufreq is not yet - * ready for such an interface. So, we only do the latter for now. + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. */ - - /* Weight RQS utilization to normal context window */ - util *= (max - irq); - util /= max; - - /* Add interrupt utilization */ - util += irq; - - /* Add DL bandwidth requirement */ - util += sg_cpu->bw_dl; - - return min(max, util); + return min(max, util + sg_cpu->bw_dl); } /** -- cgit v1.2.3 From af0fffd9300b97d8875aa745bc78e2f6fdb3c1f0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 6 Jul 2018 15:06:15 +0200 Subject: sched/core: Remove get_cpu() from sched_fork() get_cpu() disables preemption for the entire sched_fork() function. This get_cpu() was introduced in commit: dd41f596cda0 ("sched: cfs core code") ... which also invoked sched_balance_self() and this function required preemption do be off. Today, sched_balance_self() seems to be moved to ->task_fork callback which is invoked while the ->pi_lock is held. set_load_weight() could invoke reweight_task() which then via $callchain might end up in smp_processor_id() but since `update_load' is false this won't happen. I didn't find any this_cpu*() or similar usage during the initialisation of the task_struct. The `cpu' value (from get_cpu()) is only used later in __set_task_cpu() while the ->pi_lock lock is held. Based on this it is possible to remove get_cpu() and use smp_processor_id() for the `cpu' variable without breaking anything. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180706130615.g2ex2kmfu5kcvlq6@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ba6bb805693a..c3cf7d992159 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2294,7 +2294,6 @@ static inline void init_schedstats(void) {} int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; - int cpu = get_cpu(); __sched_fork(clone_flags, p); /* @@ -2330,14 +2329,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) { - put_cpu(); + if (dl_prio(p->prio)) return -EAGAIN; - } else if (rt_prio(p->prio)) { + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; - } else { + else p->sched_class = &fair_sched_class; - } init_entity_runnable_average(&p->se); @@ -2353,7 +2350,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). 
*/ - __set_task_cpu(p, cpu); + __set_task_cpu(p, smp_processor_id()); if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2370,8 +2367,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif - - put_cpu(); return 0; } -- cgit v1.2.3 From 788faab70d5a882693286b8d5022779559c79904 Mon Sep 17 00:00:00 2001 From: Tobias Tefke Date: Mon, 9 Jul 2018 12:57:15 +0200 Subject: perf, tools: Use correct articles in comments Some of the comments in the perf events code use articles incorrectly, using 'a' for words beginning with a vowel sound, where 'an' should be used. Signed-off-by: Tobias Tefke Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/20180709105715.22938-1-tobias.tefke@tutanota.com [ Fix a few more perf related 'a event' typo fixes from all around the kernel and tooling tree. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++-------- kernel/events/uprobes.c | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a4d5ac6d74af..86b31e5a5a9a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1656,7 +1656,7 @@ perf_event_groups_next(struct perf_event *event) typeof(*event), group_node)) /* - * Add a event from the lists for its context. + * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -1844,7 +1844,7 @@ static void perf_group_attach(struct perf_event *event) } /* - * Remove a event from the lists for its context. + * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -2148,7 +2148,7 @@ static void __perf_event_disable(struct perf_event *event, } /* - * Disable a event. + * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2677,7 +2677,7 @@ static void __perf_event_enable(struct perf_event *event, } /* - * Enable a event. + * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2755,7 +2755,7 @@ static int __perf_event_stop(void *info) * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * - * Since this is happening on a event-local CPU, no trace is lost + * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) @@ -4827,7 +4827,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count) int ret; /* - * Return end-of-file for a read on a event that is in + * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ @@ -9898,7 +9898,7 @@ enabled: } /* - * Allocate and initialize a event structure + * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, @@ -11229,7 +11229,7 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) } /* - * Inherit a event from parent task to child task. 
+ * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ccc579a7d32e..aed1ba569954 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -918,7 +918,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * EXPORT_SYMBOL_GPL(uprobe_register); /* - * uprobe_apply - unregister a already registered probe. + * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. * @uc: consumer which wants to add more or remove some breakpoints @@ -947,7 +947,7 @@ int uprobe_apply(struct inode *inode, loff_t offset, } /* - * uprobe_unregister - unregister a already registered probe. + * uprobe_unregister - unregister an already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. * @uc: identify which probe if multiple probes are colocated. @@ -1403,7 +1403,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri) /* * Called with no locks held. - * Called in context of a exiting or a exec-ing thread. + * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { -- cgit v1.2.3 From a210fd32a46bae6d05b43860fe3b47732501d63b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 13 Jul 2018 14:05:57 -0400 Subject: kexec: add call to LSM hook in original kexec_load syscall In order for LSMs and IMA-appraisal to differentiate between kexec_load and kexec_file_load syscalls, both the original and new syscalls must call an LSM hook. This patch adds a call to security_kernel_load_data() in the original kexec_load syscall. Signed-off-by: Mimi Zohar Cc: Eric Biederman Cc: Kees Cook Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/kexec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index aed8fb2564b3..68559808fdfa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -195,10 +196,17 @@ out: static inline int kexec_load_check(unsigned long nr_segments, unsigned long flags) { + int result; + /* We only trust the superuser with rebooting the system. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; + /* Permit LSMs and IMA to fail the kexec */ + result = security_kernel_load_data(LOADING_KEXEC_IMAGE); + if (result < 0) + return result; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. -- cgit v1.2.3 From c77b8cdf745d91eca138e7bfa430dc6640b604a0 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 13 Jul 2018 14:06:02 -0400 Subject: module: replace the existing LSM hook in init_module Both the init_module and finit_module syscalls call either directly or indirectly the security_kernel_read_file LSM hook. This patch replaces the direct call in init_module with a call to the new security_kernel_load_data hook and makes the corresponding changes in SELinux, LoadPin, and IMA. 
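The call-site pattern, shared by this patch and the kexec_load change above, is sketched below. This is schematic kernel-style code rather than a standalone program, and my_load_path() is a made-up name used only for illustration:

#include <linux/security.h>

static int my_load_path(void)
{
	int err;

	/* Let LSMs and IMA veto the load before any work is done */
	err = security_kernel_load_data(LOADING_MODULE);
	if (err)
		return err;

	/* ... copy in and verify the payload as before ... */
	return 0;
}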
Signed-off-by: Mimi Zohar Cc: Jeff Vander Stoep Cc: Casey Schaufler Cc: Kees Cook Acked-by: Jessica Yu Acked-by: Paul Moore Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f475f30eed8c..a7615d661910 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2876,7 +2876,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_read_file(NULL, READING_MODULE); + err = security_kernel_load_data(LOADING_MODULE); if (err) return err; -- cgit v1.2.3 From 76e079fefc8f62bd9b2cd2950814d1ee806e31a5 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 16 Jul 2018 11:06:01 -0700 Subject: sched/core: Use smp_mb() in wake_woken_function() wake_woken_function() synchronizes with wait_woken() as follows: [wait_woken] [wake_woken_function] entry->flags &= ~wq_flag_woken; condition = true; smp_mb(); smp_wmb(); if (condition) wq_entry->flags |= wq_flag_woken; break; This commit replaces the above smp_wmb() with an smp_mb() in order to guarantee that either wait_woken() sees the wait condition being true or the store to wq_entry->flags in woken_wake_function() follows the store in wait_woken() in the coherence order (so that the former can eventually be observed by wait_woken()). The commit also fixes a comment associated to set_current_state() in wait_woken(): the comment pairs the barrier in set_current_state() to the above smp_wmb(), while the actual pairing involves the barrier in set_current_state() and the barrier executed by the try_to_wake_up() in wake_woken_function(). Signed-off-by: Andrea Parri Signed-off-by: Paul E. 
McKenney Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akiyks@gmail.com Cc: boqun.feng@gmail.com Cc: dhowells@redhat.com Cc: j.alglave@ucl.ac.uk Cc: linux-arch@vger.kernel.org Cc: luc.maranget@inria.fr Cc: npiggin@gmail.com Cc: parri.andrea@gmail.com Cc: stern@rowland.harvard.edu Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20180716180605.16115-10-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 928be527477e..a7a2aaa3026a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,35 +392,36 @@ static inline bool is_kthread_should_stop(void) * if (condition) * break; * - * p->state = mode; condition = true; - * smp_mb(); // A smp_wmb(); // C - * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; - * schedule() try_to_wake_up(); - * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; - * smp_mb() // B smp_wmb(); // C - * wq_entry->flags |= WQ_FLAG_WOKEN; - * } - * remove_wait_queue(&wq_head, &wait); + * // in wait_woken() // in woken_wake_function() * + * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN; + * smp_mb(); // A try_to_wake_up(): + * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) + * schedule() if (p->state & mode) + * p->state = TASK_RUNNING; p->state = TASK_RUNNING; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~ + * smp_mb(); // B condition = true; + * } smp_mb(); // C + * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; */ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { - set_current_state(mode); /* A */ /* - * The above implies an smp_mb(), which matches with the smp_wmb() from - * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must - * also observe all state before the wakeup. + * The below executes an smp_mb(), which matches with the full barrier + * executed by the try_to_wake_up() in woken_wake_function() such that + * either we see the store to wq_entry->flags in woken_wake_function() + * or woken_wake_function() sees our store to current->state. */ + set_current_state(mode); /* A */ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); /* - * The below implies an smp_mb(), it too pairs with the smp_wmb() from - * woken_wake_function() such that we must either observe the wait - * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss - * an event. + * The below executes an smp_mb(), which matches with the smp_mb() (C) + * in woken_wake_function() such that either we see the wait condition + * being true or the store to wq_entry->flags in woken_wake_function() + * follows ours in the coherence order. */ smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ @@ -430,14 +431,8 @@ EXPORT_SYMBOL(wait_woken); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - /* - * Although this function is called under waitqueue lock, LOCK - * doesn't imply write barrier and the users expects write - * barrier semantics on wakeup functions. The following - * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with smp_store_mb() in wait_woken(). 
- */ - smp_wmb(); /* C */ + /* Pairs with the smp_store_mb() in wait_woken(). */ + smp_mb(); /* C */ wq_entry->flags |= WQ_FLAG_WOKEN; return default_wake_function(wq_entry, mode, sync, key); -- cgit v1.2.3 From 3d85b2703783636366560c94842affd8608ec9d1 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 16 Jul 2018 11:06:02 -0700 Subject: locking/spinlock, sched/core: Clarify requirements for smp_mb__after_spinlock() There are 11 interpretations of the requirements described in the header comment for smp_mb__after_spinlock(): one for each LKMM maintainer, and one currently encoded in the Cat file. Stick to the latter (until a more satisfactory solution is available). This also reworks some snippets related to the barrier to illustrate the requirements and to link them to the idioms which are relied upon at its call sites. Suggested-by: Boqun Feng Signed-off-by: Andrea Parri Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Will Deacon Cc: akiyks@gmail.com Cc: dhowells@redhat.com Cc: j.alglave@ucl.ac.uk Cc: linux-arch@vger.kernel.org Cc: luc.maranget@inria.fr Cc: npiggin@gmail.com Cc: parri.andrea@gmail.com Cc: stern@rowland.harvard.edu Link: http://lkml.kernel.org/r/20180716180605.16115-11-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..0c5ec2abdf93 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1998,21 +1998,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * be possible to, falsely, observe p->on_rq == 0 and get stuck * in smp_cond_load_acquire() below. * - * sched_ttwu_pending() try_to_wake_up() - * [S] p->on_rq = 1; [L] P->state - * UNLOCK rq->lock -----. - * \ - * +--- RMB - * schedule() / - * LOCK rq->lock -----' - * UNLOCK rq->lock + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock * * [task p] - * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq * - * Pairs with the UNLOCK+LOCK on rq->lock from the - * last wakeup of our task and the schedule that got our task - * current. + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) @@ -2026,15 +2025,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * One must be running (->on_cpu == 1) in order to remove oneself * from the runqueue. * - * [S] ->on_cpu = 1; [L] ->on_rq - * UNLOCK rq->lock - * RMB - * LOCK rq->lock - * [S] ->on_rq = 0; [L] ->on_cpu + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu * - * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock - * from the consecutive calls to schedule(); the first switching to our - * task, the second putting it to sleep. + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). 
*/ smp_rmb(); -- cgit v1.2.3 From 7696f9910a9a40b8a952f57d3428515fabd2d889 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Mon, 16 Jul 2018 11:06:03 -0700 Subject: sched/Documentation: Update wake_up() & co. memory-barrier guarantees Both the implementation and the users' expectation [1] for the various wakeup primitives have evolved over time, but the documentation has not kept up with these changes: brings it into 2018. [1] http://lkml.kernel.org/r/20180424091510.GB4064@hirez.programming.kicks-ass.net Also applied feedback from Alan Stern. Suggested-by: Peter Zijlstra Signed-off-by: Andrea Parri Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Cc: Akira Yokosawa Cc: Alan Stern Cc: Boqun Feng Cc: Daniel Lustig Cc: David Howells Cc: Jade Alglave Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Luc Maranget Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Cc: parri.andrea@gmail.com Link: http://lkml.kernel.org/r/20180716180605.16115-12-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 8 ++++---- kernel/sched/core.c | 30 ++++++++++++------------------ kernel/sched/wait.c | 8 ++++---- 3 files changed, 20 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index e426b0cb9ac6..a1ad5b7d5521 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -22,8 +22,8 @@ * * See also complete_all(), wait_for_completion() and related routines. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void complete(struct completion *x) { @@ -44,8 +44,8 @@ EXPORT_SYMBOL(complete); * * This will wake up all threads waiting on this particular completion event. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. * * Since complete_all() sets the completion of @x permanently to done * to allow multiple waiters to finish, a call to reinit_completion() diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c5ec2abdf93..a0065c84e73f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -412,8 +412,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). + * This cmpxchg() executes a full barrier, which pairs with the full + * barrier executed by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -441,8 +441,8 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. + * wake_up_process() executes a full barrier, which pairs with + * the queueing in wake_q_add() so as not to miss wakeups. */ wake_up_process(task); put_task_struct(task); @@ -1879,8 +1879,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * rq(c1)->lock (if not at the same time, then in that order). 
* C) LOCK of the rq(c1)->lock scheduling in task * - * Transitivity guarantees that B happens after A and C after B. - * Note: we only require RCpc transitivity. + * Release/acquire chaining guarantees that B happens after A and C after B. * Note: the CPU doing B need not be c0 or c1 * * Example: @@ -1942,16 +1941,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * UNLOCK rq(0)->lock * * - * However; for wakeups there is a second guarantee we must provide, namely we - * must observe the state that lead to our wakeup. That is, not only must our - * task observe its own prior state, it must also observe the stores prior to - * its wakeup. - * - * This means that any means of doing remote wakeups must order the CPU doing - * the wakeup against the CPU the task is going to end up running on. This, - * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). - * + * However, for wakeups there is a second guarantee we must provide, namely we + * must ensure that CONDITION=1 done by the caller can not be reordered with + * accesses to the task state; see try_to_wake_up() and set_current_state(). */ /** @@ -1967,6 +1959,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * Atomic against schedule() which would dequeue a task, also see * set_current_state(). * + * This function executes a full memory barrier before accessing the task + * state; see set_current_state(). + * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ @@ -2141,8 +2136,7 @@ out: * * Return: 1 if the process was woken up, 0 if it was already running. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * This function executes a full memory barrier before accessing the task state. */ int wake_up_process(struct task_struct *p) { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index a7a2aaa3026a..870f97b313e3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -134,8 +134,8 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) @@ -180,8 +180,8 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * * On UP it can prevent extra preemption. * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. */ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) -- cgit v1.2.3 From c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 16 Jul 2018 15:03:31 -0400 Subject: mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids The mm_struct always contains a cpumask bitmap, regardless of CONFIG_CPUMASK_OFFSTACK. 
That means the first step can be to simplify things, and simply have one bitmask at the end of the mm_struct for the mm_cpumask. This does necessitate moving everything else in mm_struct into an anonymous sub-structure, which can be randomized when struct randomization is enabled. The second step is to determine the correct size for the mm_struct slab object from the size of the mm_struct (excluding the CPU bitmap) and the size the cpumask. For init_mm we can simply allocate the maximum size this kernel is compiled for, since we only have one init_mm in the system, anyway. Pointer magic by Mike Galbraith, to evade -Wstringop-overflow getting confused by the dynamically sized array. Tested-by: Song Liu Signed-off-by: Rik van Riel Signed-off-by: Mike Galbraith Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: luto@kernel.org Link: http://lkml.kernel.org/r/20180716190337.26133-2-riel@surriel.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..5b64c1b8461e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2253,6 +2253,8 @@ static void sighand_ctor(void *data) void __init proc_caches_init(void) { + unsigned int mm_size; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -2269,15 +2271,16 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* - * FIXME! The "sizeof(struct mm_struct)" currently includes the - * whole struct cpumask for the OFFSTACK case. We could change - * this to *only* allocate as much of it as required by the - * maximum number of CPU's we can ever have. The cpumask_allocation - * is at the end of the structure, exactly for that reason. + * The mm_cpumask is located at the end of mm_struct, and is + * dynamically sized based on the maximum CPU number this system + * can have, taking hotplug into account (nr_cpu_ids). */ + mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_cachep = kmem_cache_create_usercopy("mm_struct", - sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), -- cgit v1.2.3 From d91cfeb0aa79445fcfa9f523a5b57c5e9f4113ec Mon Sep 17 00:00:00 2001 From: RAGHU Halharvi Date: Tue, 17 Jul 2018 15:50:09 +0530 Subject: genirq: Remove redundant NULL pointer check in __free_irq() The NULL pointer check in __free_irq() triggers a 'dereference before NULL pointer check' warning in static code analysis. It turns out that the check is redundant because all callers have a NULL pointer check already. Remove it. 
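Why the inner check is unreachable can be shown with a stand-alone illustration (plain C with made-up names, not a copy of kernel/irq/manage.c): the only way into the inner function is through a caller that has already rejected NULL.

#include <stddef.h>
#include <stdio.h>

struct demo_desc { int irq; };

static void inner_free(struct demo_desc *desc)
{
	/* every caller guarantees desc != NULL, so no check is needed here */
	printf("freeing irq %d\n", desc->irq);
}

static void outer_free(struct demo_desc *desc)
{
	if (!desc)		/* the caller-side NULL check */
		return;
	inner_free(desc);
}

int main(void)
{
	struct demo_desc d = { .irq = 42 };

	outer_free(&d);
	outer_free(NULL);	/* rejected before inner_free() is ever reached */
	return 0;
}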
Signed-off-by: RAGHU Halharvi Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180717102009.7708-1-raghuhack78@gmail.com --- kernel/irq/manage.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1f8be33572a7..a66c58f91bff 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1570,9 +1570,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); - if (!desc) - return NULL; - mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); -- cgit v1.2.3 From c417fbce98722ad7e384caa8ba6f2e7c5f8672d9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Jun 2018 01:40:23 +0900 Subject: kbuild: move bin2c back to scripts/ from scripts/basic/ Commit 8370edea81e3 ("bin2c: move bin2c in scripts/basic") moved bin2c to the scripts/basic/ directory, incorrectly stating "Kexec wants to use bin2c and it wants to use it really early in the build process. See arch/x86/purgatory/ code in later patches." Commit bdab125c9301 ("Revert "kexec/purgatory: Add clean-up for purgatory directory"") and commit d6605b6bbee8 ("x86/build: Remove unnecessary preparation for purgatory") removed the redundant purgatory build magic entirely. That means that the move of bin2c was unnecessary in the first place. fixdep is the only host program that deserves to sit in the scripts/basic/ directory. Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 04bc07c2b42a..7a63d567fdb5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -123,7 +123,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) -- cgit v1.2.3 From 290e44b7dd116cc61cf37b7ca0be13313bb11e37 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 17 Jul 2018 14:45:08 -0400 Subject: audit: use ktime_get_coarse_real_ts64() for timestamps Commit c72051d5778a ("audit: use ktime_get_coarse_ts64() for time access") converted audit's use of current_kernel_time64() to the new ktime_get_coarse_ts64() function. Unfortunately this resulted in incorrect timestamps, e.g. events stamped with the year 1969 despite it being 2018. This patch corrects this by using ktime_get_coarse_real_ts64() just like the current_kernel_time64() wrapper. 
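The difference between the two helpers is easy to see in a minimal kernel-style sketch (schematic, not a standalone program; show_timestamp_bases() is a made-up name):

#include <linux/kernel.h>
#include <linux/timekeeping.h>

static void show_timestamp_bases(void)
{
	struct timespec64 mono, wall;

	ktime_get_coarse_ts64(&mono);		/* coarse CLOCK_MONOTONIC: roughly time since boot */
	ktime_get_coarse_real_ts64(&wall);	/* coarse CLOCK_REALTIME: seconds since the epoch */

	/* audit records need the wall-clock value, hence the one-line fix */
	pr_info("mono=%lld wall=%lld\n",
		(long long)mono.tv_sec, (long long)wall.tv_sec);
}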
Fixes: c72051d5778a ("audit: use ktime_get_coarse_ts64() for time access") Reviewed-by: Arnd Bergmann Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e17bc697d11c..2a8058764aa6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1721,7 +1721,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_coarse_ts64(t); + ktime_get_coarse_real_ts64(t); *serial = audit_serial(); } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f6a0cb32d76e..fb207466e99b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1543,7 +1543,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->in_syscall = 1; context->current_state = state; context->ppid = 0; - ktime_get_coarse_ts64(&context->ctime); + ktime_get_coarse_real_ts64(&context->ctime); } /** -- cgit v1.2.3 From d29ab6e1fa21ebc2a8a771015dd9e0e5d4e28dc1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 13 Jul 2018 12:41:10 -0700 Subject: bpf: bpf_prog_array_alloc() should return a generic non-rcu pointer Currently the return type of the bpf_prog_array_alloc() is struct bpf_prog_array __rcu *, which is not quite correct. Obviously, the returned pointer is a generic pointer, which is valid for an indefinite amount of time and it's not shared with anyone else, so there is no sense in marking it as __rcu. This change eliminate the following sparse warnings: kernel/bpf/core.c:1544:31: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1544:31: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1544:31: got void * kernel/bpf/core.c:1548:17: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1548:17: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1548:17: got struct bpf_prog_array * kernel/bpf/core.c:1681:15: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1681:15: expected struct bpf_prog_array *array kernel/bpf/core.c:1681:15: got struct bpf_prog_array [noderef] * Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e5625d46414..253aa8e79c7b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1538,7 +1538,7 @@ static struct { .null_prog = NULL, }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + -- cgit v1.2.3 From 3960f4fd6585608e8cc285d9665821985494e147 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 13 Jul 2018 12:41:11 -0700 Subject: bpf: fix rcu annotations in compute_effective_progs() The progs local variable in compute_effective_progs() is marked as __rcu, which is not correct. This is a local pointer, which is initialized by bpf_prog_array_alloc(), which also now returns a generic non-rcu pointer. The real rcu-protected pointer is *array (array is a pointer to an RCU-protected pointer), so the assignment should be performed using rcu_assign_pointer(). 
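The annotation rule applied by this and the previous patch is the usual RCU publication pattern, sketched here with made-up names (struct foo, global_foo, publish_foo): a private, not-yet-published object is a plain pointer, only the shared pointer is __rcu, and publication goes through rcu_assign_pointer().

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo { int val; };

static struct foo __rcu *global_foo;

static int publish_foo(int val)
{
	struct foo *f;				/* plain pointer: still private */

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return -ENOMEM;
	f->val = val;

	rcu_assign_pointer(global_foo, f);	/* publication point */
	return 0;
}

static int read_foo(void)
{
	struct foo *f;
	int val = 0;

	rcu_read_lock();
	f = rcu_dereference(global_foo);	/* readers pair with the assign */
	if (f)
		val = f->val;
	rcu_read_unlock();
	return val;
}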
Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3d83ee7df381..badabb0b435c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -95,7 +95,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { - struct bpf_prog_array __rcu *progs; + struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; int cnt = 0; @@ -120,13 +120,12 @@ static int compute_effective_progs(struct cgroup *cgrp, &p->bpf.progs[type], node) { if (!pl->prog) continue; - rcu_dereference_protected(progs, 1)-> - progs[cnt++] = pl->prog; + progs->progs[cnt++] = pl->prog; } p = cgroup_parent(p); } while (p); - *array = progs; + rcu_assign_pointer(*array, progs); return 0; } -- cgit v1.2.3 From c23e014a4ba1a9448cbbd74916377f23a7da2fc2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 09:38:59 +0100 Subject: bpf: sockmap: remove redundant pointer sg Pointer sg is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'sg' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf7b6a6dbd1f..3e7a5f622712 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -719,11 +719,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, { bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; - struct scatterlist *sg; int err = 0; - sg = md->sg_data; - rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) -- cgit v1.2.3 From 09728266b6f99ab57cd4f84f3eead65b7b65dbf7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:23 -0700 Subject: bpf: offload: rename bpf_offload_dev_match() to bpf_offload_prog_map_match() A set of new API functions exported for the drivers will soon use 'bpf_offload_dev_' as a prefix. Rename the bpf_offload_dev_match() which is internal to the core (used by the verifier) to avoid any confusion. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 2 +- kernel/bpf/verifier.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac747d5cf7c6..6184e48703f4 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -468,7 +468,7 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; struct bpf_prog_offload *offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..15d69b278277 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5054,7 +5054,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { + !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } -- cgit v1.2.3 From 9fd7c5559165f4c679b40c5e6ad442955832dfad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:24 -0700 Subject: bpf: offload: aggregate offloads per-device Currently we have two lists of offloaded objects - programs and maps. Netdevice unregister notifier scans those lists to orphan objects associated with device being unregistered. This puts unnecessary (even if negligible) burden on all netdev unregister calls in BPF- -enabled kernel. The lists of objects may potentially get long making the linear scan even more problematic. There haven't been complaints about this mechanisms so far, but it is suboptimal. Instead of relying on notifiers, make the few BPF-capable drivers register explicitly for BPF offloads. The programs and maps will now be collected per-device not on a global list, and only scanned for removal when driver unregisters from BPF offloads. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 142 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6184e48703f4..cd64a26807aa 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -18,19 +18,37 @@ #include #include #include +#include #include #include #include +#include #include #include -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members +/* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. 
*/ static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); + +struct bpf_offload_netdev { + struct rhash_head l; + struct net_device *netdev; + struct list_head progs; + struct list_head maps; +}; + +static const struct rhashtable_params offdevs_params = { + .nelem_hint = 4, + .key_len = sizeof(struct net_device *), + .key_offset = offsetof(struct bpf_offload_netdev, netdev), + .head_offset = offsetof(struct bpf_offload_netdev, l), + .automatic_shrinking = true, +}; + +static struct rhashtable offdevs; +static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -41,8 +59,19 @@ static int bpf_dev_offload_check(struct net_device *netdev) return 0; } +static struct bpf_offload_netdev * +bpf_offload_find_netdev(struct net_device *netdev) +{ + lockdep_assert_held(&bpf_devs_lock); + + if (!offdevs_inited) + return NULL; + return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { + struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; @@ -66,12 +95,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { + ondev = bpf_offload_find_netdev(offload->netdev); + if (!ondev) { err = -EINVAL; goto err_unlock; } prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); @@ -294,6 +324,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; + struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; @@ -316,11 +347,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (err) goto err_unlock; + ondev = bpf_offload_find_netdev(offmap->netdev); + if (!ondev) { + err = -EINVAL; + goto err_unlock; + } + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; - list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); @@ -489,56 +526,69 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -static void bpf_offload_orphan_all_progs(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct net_device *netdev) { - struct bpf_prog_offload *offload, *tmp; + struct bpf_offload_netdev *ondev; + int err; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); -} + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return err; + offdevs_inited = true; + } + up_write(&bpf_devs_lock); -static void bpf_offload_orphan_all_maps(struct net_device *netdev) -{ - struct bpf_offloaded_map *offmap, *tmp; + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; + + ondev->netdev = netdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + down_write(&bpf_devs_lock); + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_unlock_free; + } - 
list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + return 0; + +err_unlock_free: + up_write(&bpf_devs_lock); + kfree(ondev); + return err; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) +void bpf_offload_dev_netdev_unregister(struct net_device *netdev) { - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + struct bpf_offload_netdev *ondev; ASSERT_RTNL(); - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; - - down_write(&bpf_devs_lock); - bpf_offload_orphan_all_progs(netdev); - bpf_offload_orphan_all_maps(netdev); - up_write(&bpf_devs_lock); - break; - default: - break; - } - return NOTIFY_OK; -} + down_write(&bpf_devs_lock); + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + goto unlock; -static struct notifier_block bpf_offload_notifier = { - .notifier_call = bpf_offload_notification, -}; + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); -static int __init bpf_offload_init(void) -{ - register_netdevice_notifier(&bpf_offload_notifier); - return 0; -} + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); -subsys_initcall(bpf_offload_init); + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +unlock: + up_write(&bpf_devs_lock); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -- cgit v1.2.3 From 602144c224604f1cbff02ee2d1cf46825269ecbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:25 -0700 Subject: bpf: offload: keep the offload state per-ASIC Create a higher-level entity to represent a device/ASIC to allow programs and maps to be shared between device ports. The extra work is required to make sure we don't destroy BPF objects as soon as the netdev for which they were loaded gets destroyed, as other ports may still be using them. When netdev goes away all of its BPF objects will be moved to other netdevs of the device, and only destroyed when last netdev is unregistered. 
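For orientation, here is a minimal driver-side sketch of how the per-ASIC helpers added by this and the next patch are meant to be wired up. The foo_* structure and hooks are placeholders, not a real driver; only the bpf_offload_dev_*() calls come from the diffs in these patches.

/*
 * Hypothetical multi-port driver tying all of its netdevs to one
 * bpf_offload_dev so offloaded programs and maps can be shared
 * between ports of the same ASIC.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/netdevice.h>

struct foo_asic {
	struct bpf_offload_dev *bpf_dev;
};

static int foo_asic_probe(struct foo_asic *asic)
{
	asic->bpf_dev = bpf_offload_dev_create();
	return PTR_ERR_OR_ZERO(asic->bpf_dev);
}

static int foo_port_init(struct foo_asic *asic, struct net_device *netdev)
{
	/* Every port of the ASIC registers against the shared offload dev. */
	return bpf_offload_dev_netdev_register(asic->bpf_dev, netdev);
}

static void foo_port_remove(struct foo_asic *asic, struct net_device *netdev)
{
	/* Called with RTNL held; live objects migrate to a sibling port. */
	bpf_offload_dev_netdev_unregister(asic->bpf_dev, netdev);
}

static void foo_asic_remove(struct foo_asic *asic)
{
	/* All ports are gone by now, so nothing is left to migrate. */
	bpf_offload_dev_destroy(asic->bpf_dev);
}

Ports registered against the same bpf_offload_dev can then share offloaded programs and maps, and the bpf_offload_dev_match() helper exported by the next patch lets a driver check a program against a netdev at attachment time.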
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 81 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cd64a26807aa..925575f64ff1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,11 +32,17 @@ */ static DECLARE_RWSEM(bpf_devs_lock); +struct bpf_offload_dev { + struct list_head netdevs; +}; + struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; + struct bpf_offload_dev *offdev; struct list_head progs; struct list_head maps; + struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { @@ -526,25 +532,18 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -int bpf_offload_dev_netdev_register(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) - return err; - offdevs_inited = true; - } - up_write(&bpf_devs_lock); - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; + ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); @@ -555,6 +554,7 @@ int bpf_offload_dev_netdev_register(struct net_device *netdev) goto err_unlock_free; } + list_add(&ondev->offdev_netdevs, &offdev->netdevs); up_write(&bpf_devs_lock); return 0; @@ -565,11 +565,12 @@ err_unlock_free: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev) +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) { + struct bpf_offload_netdev *ondev, *altdev; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; - struct bpf_offload_netdev *ondev; ASSERT_RTNL(); @@ -579,11 +580,26 @@ void bpf_offload_dev_netdev_unregister(struct net_device *netdev) goto unlock; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); - - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); + list_del(&ondev->offdev_netdevs); + + /* Try to move the objects to another netdev of the device */ + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); @@ -592,3 +608,34 @@ unlock: up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); + +struct bpf_offload_dev *bpf_offload_dev_create(void) +{ + struct bpf_offload_dev *offdev; + int err; + + 
down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return ERR_PTR(err); + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); + if (!offdev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&offdev->netdevs); + + return offdev; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_create); + +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) +{ + WARN_ON(!list_empty(&offdev->netdevs)); + kfree(offdev); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); -- cgit v1.2.3 From fd4f227dea0f24d89f52f7c4eb3207f84ddcbcbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:26 -0700 Subject: bpf: offload: allow program and map sharing per-ASIC Allow programs and maps to be re-used across different netdevs, as long as they belong to the same struct bpf_offload_dev. Update the bpf_offload_prog_map_match() helper for the verifier and export a new helper for the drivers to use when checking programs at attachment time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 925575f64ff1..177a52436394 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -511,22 +511,50 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +static bool __bpf_offload_dev_match(struct bpf_prog *prog, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap; + struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; - bool ret; if (!bpf_prog_is_dev_bound(prog->aux)) return false; - if (!bpf_map_is_dev_bound(map)) - return bpf_map_offload_neutral(map); - down_read(&bpf_devs_lock); offload = prog->aux->offload; + if (!offload) + return false; + if (offload->netdev == netdev) + return true; + + ondev1 = bpf_offload_find_netdev(offload->netdev); + ondev2 = bpf_offload_find_netdev(netdev); + + return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) +{ + bool ret; + + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, netdev); + up_read(&bpf_devs_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_match); + +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + bool ret; + + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); - ret = offload && offload->netdev == offmap->netdev; + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; -- cgit v1.2.3 From baa2a4fdd525c8c4b0f704d20457195b29437839 Mon Sep 17 00:00:00 2001 From: Ronny Chevalier Date: Wed, 11 Jul 2018 14:39:37 +0200 Subject: audit: fix use-after-free in audit_add_watch audit_add_watch stores locally krule->watch without taking a reference on watch. Then, it calls audit_add_to_parent, and uses the watch stored locally. Unfortunately, it is possible that audit_add_to_parent updates krule->watch. When it happens, it also drops a reference of watch which could free the watch. 
How to reproduce (with KASAN enabled): auditctl -w /etc/passwd -F success=0 -k test_passwd auditctl -w /etc/passwd -F success=1 -k test_passwd2 The second call to auditctl triggers the use-after-free, because audit_to_parent updates krule->watch to use a previous existing watch and drops the reference to the newly created watch. To fix the issue, we grab a reference of watch and we release it at the end of the function. Signed-off-by: Ronny Chevalier Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit_watch.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6f249bdf2d84..787c7afdf829 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -420,6 +420,13 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) struct path parent_path; int h, ret = 0; + /* + * When we will be calling audit_add_to_parent, krule->watch might have + * been updated and watch might have been freed. + * So we need to keep a reference of watch. + */ + audit_get_watch(watch); + mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ @@ -428,8 +435,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) + if (ret) { + audit_put_watch(watch); return ret; + } /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); @@ -447,6 +456,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) *list = &audit_inode_hash[h]; error: path_put(&parent_path); + audit_put_watch(watch); return ret; } -- cgit v1.2.3 From 30587589251a00974115e0815ac316980f48dbb5 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 16 Jul 2018 14:08:57 +0800 Subject: timer: Fix coding style The call to wake_up_nohz_cpu() is incorrectly indented. Remove the surplus TAB. Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Reviewed-by: Jiang Biao Cc: john.stultz@linaro.org Cc: sboyd@kernel.org Cc: zhong.weidong@zte.com.cn CC: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/1531721337-30284-1-git-send-email-wang.yi59@zte.com.cn --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index cc2d23e6ff61..baa528a24a73 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -581,7 +581,7 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * wheel: */ base->next_expiry = timer->expires; - wake_up_nohz_cpu(base->cpu); + wake_up_nohz_cpu(base->cpu); } static void -- cgit v1.2.3 From 0f9987b63dcc68606b82a349bbb2016b392a2c34 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 13 Jul 2018 14:06:40 +0200 Subject: ntp: Remove redundant arguments The 'ts' argument of process_adj_status() and process_adjtimex_modes() is unused and can be safely removed. 
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Ondrej Mosnacek Signed-off-by: John Stultz --- kernel/time/ntp.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 10a79053e82f..3eddac25675f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -642,7 +642,7 @@ void ntp_notify_cmos_timer(void) /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) +static inline void process_adj_status(struct timex *txc) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -665,12 +665,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) } -static inline void process_adjtimex_modes(struct timex *txc, - struct timespec64 *ts, - s32 *time_tai) +static inline void process_adjtimex_modes(struct timex *txc, s32 *time_tai) { if (txc->modes & ADJ_STATUS) - process_adj_status(txc, ts); + process_adj_status(txc); if (txc->modes & ADJ_NANO) time_status |= STA_NANO; @@ -735,7 +733,7 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, ts, time_tai); + process_adjtimex_modes(txc, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); -- cgit v1.2.3 From 86b2dcd4f02b86f6b5927fc0adcac777825f4030 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 13 Jul 2018 14:06:41 +0200 Subject: ntp: Use kstrtos64 for s64 variable ...instead of kstrtol with a dirty cast. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Ondrej Mosnacek Signed-off-by: John Stultz --- kernel/time/ntp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3eddac25675f..a627cae8baab 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -1020,12 +1020,11 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); - + int rc = kstrtos64(str, 0, &ntp_tick_adj); if (rc) return rc; - ntp_tick_adj <<= NTP_SCALE_SHIFT; + ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } -- cgit v1.2.3 From 3eca993740b8eb40f514b90b1877a4dbcf0a6710 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:34 -0400 Subject: timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset() If architecture does not support exact boot time, it is challenging to estimate boot time without having a reference to the current persistent clock value. Yet, it cannot read the persistent clock time again, because this may lead to math discrepancies with the caller of read_boot_clock64() who have read the persistent clock at a different time. This is why it is better to provide two values simultaneously: the persistent clock value, and the boot time. Replace read_boot_clock64() with: read_persistent_wall_and_boot_offset(wall_time, boot_offset) Where wall_time is returned by read_persistent_clock() And boot_offset is wall_time - boot time, which defaults to 0. 
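As an illustration of the intended use (not part of this series), an architecture with a persistent clock plus an early free-running counter could override the new weak hook roughly as follows; arch_read_persistent_clock() and arch_early_counter_ns() are invented placeholders for whatever the platform actually provides.

#include <linux/init.h>
#include <linux/time64.h>
#include <linux/types.h>

/* Placeholders for platform facilities, declared for this sketch only. */
extern void arch_read_persistent_clock(struct timespec64 *ts);
extern u64 arch_early_counter_ns(void);

void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
						 struct timespec64 *boot_offset)
{
	/* Current wall time from the platform's persistent clock. */
	arch_read_persistent_clock(wall_time);

	/*
	 * Nanoseconds the counter has run since boot; by definition
	 * boot_offset = wall_time - boot_time, i.e. time spent booted.
	 */
	*boot_offset = ns_to_timespec64(arch_early_counter_ns());
}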
Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-16-pasha.tatashin@oracle.com --- kernel/time/timekeeping.c | 59 ++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4786df904c22..cb738f825c12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1496,18 +1497,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock64 - Return time of the system start. + * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset + * from the boot. * * Weak dummy function for arches that do not yet support it. - * Function to read the exact time the system has been started. - * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. + * wall_time - current time as returned by persistent clock + * boot_offset - offset that is defined as wall_time - boot_time + * default to 0. */ -void __weak read_boot_clock64(struct timespec64 *ts) +void __weak __init +read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, + struct timespec64 *boot_offset) { - ts->tv_sec = 0; - ts->tv_nsec = 0; + read_persistent_clock64(wall_time); + *boot_offset = (struct timespec64){0}; } /* Flag for if timekeeping_resume() has injected sleeptime */ @@ -1521,28 +1524,29 @@ static bool persistent_clock_exists; */ void __init timekeeping_init(void) { + struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec64 now, boot, tmp; - - read_persistent_clock64(&now); - if (!timespec64_valid_strict(&now)) { - pr_warn("WARNING: Persistent clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - now.tv_sec = 0; - now.tv_nsec = 0; - } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exists = true; - read_boot_clock64(&boot); - if (!timespec64_valid_strict(&boot)) { - pr_warn("WARNING: Boot clock returned invalid value!\n" - " Check your CMOS/BIOS settings.\n"); - boot.tv_sec = 0; - boot.tv_nsec = 0; + read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); + if (timespec64_valid_strict(&wall_time) && + timespec64_to_ns(&wall_time) > 0) { + persistent_clock_exists = true; + } else { + pr_warn("Persistent clock returned invalid value"); + wall_time = (struct timespec64){0}; } + if (timespec64_compare(&wall_time, &boot_offset) < 0) + boot_offset = (struct timespec64){0}; + + /* + * We want set wall_to_mono, so the following is true: + * wall time + wall_to_mono = boot time + */ + wall_to_mono = timespec64_sub(boot_offset, wall_time); + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); ntp_init(); @@ -1552,13 +1556,10 @@ void __init timekeeping_init(void) 
clock->enable(clock); tk_setup_internals(tk, clock); - tk_set_xtime(tk, &now); + tk_set_xtime(tk, &wall_time); tk->raw_sec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(tk); - set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); - tk_set_wall_to_mono(tk, tmp); + tk_set_wall_to_mono(tk, wall_to_mono); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); -- cgit v1.2.3 From 4b1b7f8054896cee25669f6cea7cb6dd17f508f7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:35 -0400 Subject: timekeeping: Default boot time offset to local_clock() read_persistent_wall_and_boot_offset() is called during boot to read both the persistent clock and also return the offset between the boot time and the value of persistent clock. Change the default boot_offset from zero to local_clock() so architectures, that do not have a dedicated boot_clock but have early sched_clock(), such as SPARCv9, x86, and possibly more will benefit from this change by getting a better and more consistent estimate of the boot time without need for an arch specific implementation. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-17-pasha.tatashin@oracle.com --- kernel/time/timekeeping.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cb738f825c12..30d7f64ffc87 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1503,14 +1503,17 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) * Weak dummy function for arches that do not yet support it. * wall_time - current time as returned by persistent clock * boot_offset - offset that is defined as wall_time - boot_time - * default to 0. + * The default function calculates offset based on the current value of + * local_clock(). This way architectures that support sched_clock() but don't + * support dedicated boot time clock will provide the best estimate of the + * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); - *boot_offset = (struct timespec64){0}; + *boot_offset = ns_to_timespec64(local_clock()); } /* Flag for if timekeeping_resume() has injected sleeptime */ -- cgit v1.2.3 From 5d2a4e91a541cb04d20d11602f0f9340291322ac Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:41 -0400 Subject: sched/clock: Move sched clock initialization and merge with generic clock sched_clock_postinit() initializes a generic clock on systems where no other clock is provided. This function may be called only after timekeeping_init(). Rename sched_clock_postinit to generic_clock_inti() and call it from sched_clock_init(). Move the call for sched_clock_init() until after time_init(). 
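The resulting early-boot ordering, sketched very loosely (the change to the boot sequence itself lives in init/main.c, outside the kernel/ diffstat shown here, so this is an illustration rather than the actual hunk):

asmlinkage __visible void __init start_kernel(void)
{
	/* ... earlier setup ... */
	time_init();		/* timekeeping and clocksources are ready */
	sched_clock_init();	/* now runs after time_init(); calls
				 * generic_sched_clock_init() when the
				 * unstable sched clock is not used */
	/* ... rest of boot ... */
}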
Suggested-by: Peter Zijlstra Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-23-pasha.tatashin@oracle.com --- kernel/sched/clock.c | 27 +++++++++++++++++---------- kernel/sched/core.c | 1 - kernel/time/sched_clock.c | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 10c83e73837a..0e9dbb2d9aea 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,6 +53,7 @@ * */ #include "sched.h" +#include /* * Scheduler clock - returns current time in nanosec units. @@ -68,11 +69,6 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * We must start with !__sched_clock_stable because the unstable -> stable @@ -199,6 +195,15 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } +static void __sched_clock_gtod_offset(void) +{ + __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns(); +} + +void __init sched_clock_init(void) +{ + sched_clock_running = 1; +} /* * We run this as late_initcall() such that it runs after all built-in drivers, * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. @@ -385,8 +390,6 @@ void sched_clock_tick(void) void sched_clock_tick_stable(void) { - u64 gtod, clock; - if (!sched_clock_stable()) return; @@ -398,9 +401,7 @@ void sched_clock_tick_stable(void) * TSC to be unstable, any computation will be computing crap. 
*/ local_irq_disable(); - gtod = ktime_get_ns(); - clock = sched_clock(); - __gtod_offset = (clock + __sched_clock_offset) - gtod; + __sched_clock_gtod_offset(); local_irq_enable(); } @@ -434,6 +435,12 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +void __init sched_clock_init(void) +{ + sched_clock_running = 1; + generic_sched_clock_init(); +} + u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe365c9a08e9..552406e9713b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5954,7 +5954,6 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; - sched_clock_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 2d8f05aad442..cbc72c2c1fca 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -237,7 +237,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -void __init sched_clock_postinit(void) +void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, -- cgit v1.2.3 From 857baa87b6422bcfb84ed3631d6839920cb5b09d Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:42 -0400 Subject: sched/clock: Enable sched clock early Allow sched_clock() to be used before schec_clock_init() is called. This provides a way to get early boot timestamps on machines with unstable clocks. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-24-pasha.tatashin@oracle.com --- kernel/sched/clock.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 0e9dbb2d9aea..422cd63f8f17 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -202,7 +202,25 @@ static void __sched_clock_gtod_offset(void) void __init sched_clock_init(void) { + unsigned long flags; + + /* + * Set __gtod_offset such that once we mark sched_clock_running, + * sched_clock_tick() continues where sched_clock() left off. + * + * Even if TSC is buggered, we're still UP at this point so it + * can't really be out of sync. 
+ */ + local_irq_save(flags); + __sched_clock_gtod_offset(); + local_irq_restore(flags); + sched_clock_running = 1; + + /* Now that sched_clock_running is set adjust scd */ + local_irq_save(flags); + sched_clock_tick(); + local_irq_restore(flags); } /* * We run this as late_initcall() such that it runs after all built-in drivers, @@ -356,7 +374,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock() + __sched_clock_offset; if (unlikely(!sched_clock_running)) - return 0ull; + return sched_clock(); preempt_disable_notrace(); scd = cpu_sdc(cpu); -- cgit v1.2.3 From 46457ea464f5341d1f9dad8dd213805d45f7f117 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:43 -0400 Subject: sched/clock: Use static key for sched_clock_running sched_clock_running may be read every time sched_clock_cpu() is called. Yet, this variable is updated only twice during boot, and never changes again, therefore it is better to make it a static key. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-25-pasha.tatashin@oracle.com --- kernel/sched/clock.c | 16 ++++++++-------- kernel/sched/debug.c | 2 -- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 422cd63f8f17..c5c47ad3f386 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -67,7 +67,7 @@ unsigned long long __weak sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -__read_mostly int sched_clock_running; +static DEFINE_STATIC_KEY_FALSE(sched_clock_running); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* @@ -191,7 +191,7 @@ void clear_sched_clock_stable(void) smp_mb(); /* matches sched_clock_init_late() */ - if (sched_clock_running == 2) + if (static_key_count(&sched_clock_running.key) == 2) __clear_sched_clock_stable(); } @@ -215,7 +215,7 @@ void __init sched_clock_init(void) __sched_clock_gtod_offset(); local_irq_restore(flags); - sched_clock_running = 1; + static_branch_inc(&sched_clock_running); /* Now that sched_clock_running is set adjust scd */ local_irq_save(flags); @@ -228,7 +228,7 @@ void __init sched_clock_init(void) */ static int __init sched_clock_init_late(void) { - sched_clock_running = 2; + static_branch_inc(&sched_clock_running); /* * Ensure that it is impossible to not do a static_key update. 
* @@ -373,7 +373,7 @@ u64 sched_clock_cpu(int cpu) if (sched_clock_stable()) return sched_clock() + __sched_clock_offset; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return sched_clock(); preempt_disable_notrace(); @@ -396,7 +396,7 @@ void sched_clock_tick(void) if (sched_clock_stable()) return; - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return; lockdep_assert_irqs_disabled(); @@ -455,13 +455,13 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); void __init sched_clock_init(void) { - sched_clock_running = 1; + static_branch_inc(&sched_clock_running); generic_sched_clock_init(); } u64 sched_clock_cpu(int cpu) { - if (unlikely(!sched_clock_running)) + if (!static_branch_unlikely(&sched_clock_running)) return 0; return sched_clock(); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e593b4118578..b0212f489a33 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -623,8 +623,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) #undef PU } -extern __read_mostly int sched_clock_running; - static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); -- cgit v1.2.3 From 985e695074d35768cb04d65f58bca45f7bf1a99d Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 13 Jul 2018 14:06:42 +0200 Subject: timekeeping/ntp: Constify some function arguments Add 'const' to some function arguments and variables to make it easier to read the code. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Signed-off-by: Ondrej Mosnacek [jstultz: Also fixup pre-existing checkpatch warnings for prototype arguments with no variable name] Signed-off-by: John Stultz --- kernel/time/ntp.c | 6 +++--- kernel/time/ntp_internal.h | 4 ++-- kernel/time/timekeeping.c | 29 +++++++++++++++-------------- kernel/time/timekeeping_debug.c | 2 +- kernel/time/timekeeping_internal.h | 2 +- 5 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a627cae8baab..c5e0cba3b39c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -642,7 +642,7 @@ void ntp_notify_cmos_timer(void) /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(struct timex *txc) +static inline void process_adj_status(const struct timex *txc) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -665,7 +665,7 @@ static inline void process_adj_status(struct timex *txc) } -static inline void process_adjtimex_modes(struct timex *txc, s32 *time_tai) +static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc); @@ -716,7 +716,7 @@ static inline void process_adjtimex_modes(struct timex *txc, s32 *time_tai) * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
*/ -int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) +int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) { int result; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 909bd1f1bfb1..c24b0e13f011 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); -extern void __hardpps(const struct timespec64 *, const struct timespec64 *); +extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ad779c2ec53a..7033ac1fee65 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -105,7 +105,7 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } -static inline struct timespec64 tk_xtime(struct timekeeper *tk) +static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; @@ -162,7 +162,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ -static inline u64 tk_clock_read(struct tk_read_base *tkr) +static inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); @@ -211,7 +211,7 @@ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) } } -static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) +static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) { struct timekeeper *tk = &tk_core.timekeeper; u64 now, last, mask, max, delta; @@ -255,7 +255,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) { } -static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) +static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) { u64 cycle_now, delta; @@ -352,7 +352,7 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta) +static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) { u64 nsec; @@ -363,7 +363,7 @@ static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, u64 delta) return nsec + arch_gettimeoffset(); } -static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { u64 delta; @@ -371,7 +371,7 @@ static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) return timekeeping_delta_to_ns(tkr, delta); } -static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles) +static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { u64 delta; @@ -394,7 +394,8 @@ static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, u64 cycles) * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. 
*/ -static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) +static void update_fast_timekeeper(const struct tk_read_base *tkr, + struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; @@ -549,10 +550,10 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); * number of cycles every time until timekeeping is resumed at which time the * proper readout base for the fast timekeeper will be restored automatically. */ -static void halt_fast_timekeeper(struct timekeeper *tk) +static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; - struct tk_read_base *tkr = &tk->tkr_mono; + const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); @@ -1277,7 +1278,7 @@ EXPORT_SYMBOL(do_settimeofday64); * * Adds or subtracts an offset value from the current time. */ -static int timekeeping_inject_offset(struct timespec64 *ts) +static int timekeeping_inject_offset(const struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; @@ -1585,7 +1586,7 @@ static struct timespec64 timekeeping_suspend_time; * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, - struct timespec64 *delta) + const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING @@ -1646,7 +1647,7 @@ bool timekeeping_rtc_skipsuspend(void) * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ -void timekeeping_inject_sleeptime64(struct timespec64 *delta) +void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; @@ -2240,7 +2241,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /** * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(struct timex *txc) +static int timekeeping_validate_timex(const struct timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 0754cadfa9e6..238e4be60229 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -70,7 +70,7 @@ static int __init tk_debug_sleep_time_init(void) } late_initcall(tk_debug_sleep_time_init); -void tk_debug_account_sleep_time(struct timespec64 *t) +void tk_debug_account_sleep_time(const struct timespec64 *t) { /* Cap bin index so we don't overflow the array */ int bin = min(fls(t->tv_sec), NUM_BINS-1); diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index cf5c0828ee31..bcbb52db2256 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -8,7 +8,7 @@ #include #ifdef CONFIG_DEBUG_FS -extern void tk_debug_account_sleep_time(struct timespec64 *t); +extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else #define tk_debug_account_sleep_time(x) #endif -- cgit v1.2.3 From f473e5f467f6049370575390b08dc42131315d60 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 17 Jul 2018 12:01:29 +0530 Subject: time: Fix extra sleeptime injection when suspend fails Currently, there exists a corner case assuming when there is only one clocksource e.g RTC, and system failed to go to suspend mode. 
While resume rtc_resume() injects the sleeptime as timekeeping_rtc_skipresume() returned 'false' (default value of sleeptime_injected) due to which we can see mismatch in timestamps. This issue can also come in a system where more than one clocksource are present and very first suspend fails. Success case: ------------ {sleeptime_injected=false} rtc_suspend() => timekeeping_suspend() => timekeeping_resume() => (sleeptime injected) rtc_resume() Failure case: ------------ {failure in sleep path} {sleeptime_injected=false} rtc_suspend() => rtc_resume() {sleeptime injected again which was not required as the suspend failed} Fix this by handling the boolean logic properly. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Originally-by: Thomas Gleixner Signed-off-by: Mukesh Ojha Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7033ac1fee65..19414b16cf9e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1519,8 +1519,20 @@ void __weak read_boot_clock64(struct timespec64 *ts) ts->tv_nsec = 0; } -/* Flag for if timekeeping_resume() has injected sleeptime */ -static bool sleeptime_injected; +/* + * Flag reflecting whether timekeeping_resume() has injected sleeptime. + * + * The flag starts of false and is only set when a suspend reaches + * timekeeping_suspend(), timekeeping_resume() sets it to false when the + * timekeeper clocksource is not stopping across suspend and has been + * used to update sleep time. If the timekeeper clocksource has stopped + * then the flag stays true and is used by the RTC resume code to decide + * whether sleeptime must be injected and if so the flag gets false then. + * + * If a suspend fails before reaching timekeeping_resume() then the flag + * stays false and prevents erroneous sleeptime injection. 
+ */ +static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; @@ -1619,7 +1631,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ bool timekeeping_rtc_skipresume(void) { - return sleeptime_injected; + return !suspend_timing_needed; } /** @@ -1655,6 +1667,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); + suspend_timing_needed = false; + timekeeping_forward_now(tk); __timekeeping_inject_sleeptime(tk, delta); @@ -1679,8 +1693,8 @@ void timekeeping_resume(void) unsigned long flags; struct timespec64 ts_new, ts_delta; u64 cycle_now; + bool inject_sleeptime = false; - sleeptime_injected = false; read_persistent_clock64(&ts_new); clockevents_resume(); @@ -1710,14 +1724,16 @@ void timekeeping_resume(void) tk->tkr_mono.mask); nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift); ts_delta = ns_to_timespec64(nsec); - sleeptime_injected = true; + inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); - sleeptime_injected = true; + inject_sleeptime = true; } - if (sleeptime_injected) + if (inject_sleeptime) { + suspend_timing_needed = false; __timekeeping_inject_sleeptime(tk, &ts_delta); + } /* Re-base the last cycle value */ tk->tkr_mono.cycle_last = cycle_now; @@ -1752,6 +1768,8 @@ int timekeeping_suspend(void) if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; + suspend_timing_needed = true; + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); -- cgit v1.2.3 From 39232ed5a1793f67b11430c43ed8a9ed6e96c6eb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 17 Jul 2018 15:55:16 +0800 Subject: time: Introduce one suspend clocksource to compensate the suspend time On some hardware with multiple clocksources, we have coarse grained clocksources that support the CLOCK_SOURCE_SUSPEND_NONSTOP flag, but which are less than ideal for timekeeping whereas other clocksources can be better candidates but halt on suspend. Currently, the timekeeping core only supports timing suspend using CLOCK_SOURCE_SUSPEND_NONSTOP clocksources if that clocksource is the current clocksource for timekeeping. As a result, some architectures try to implement read_persistent_clock64() using those non-stop clocksources, but isn't really ideal, which will introduce more duplicate code. To fix this, provide logic to allow a registered SUSPEND_NONSTOP clocksource, which isn't the current clocksource, to be used to calculate the suspend time. 
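As a hedged sketch of the driver side (the foo_* names, rating and rate are placeholders): a counter that keeps running across suspend only needs to set CLOCK_SOURCE_SUSPEND_NONSTOP, and per the warning added below it should not provide suspend/resume callbacks, because the core may read it around the suspended interval.

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

static void __iomem *foo_cnt_base;	/* mapped during probe (placeholder) */

static u64 foo_cnt_read(struct clocksource *cs)
{
	return readl_relaxed(foo_cnt_base);
}

static struct clocksource foo_cnt_cs = {
	.name	= "foo-always-on",
	.rating	= 150,	/* coarse: fine for suspend timing, poor for timekeeping */
	.read	= foo_cnt_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_SUSPEND_NONSTOP,
	/* no .suspend/.resume: the counter never stops */
};

static int __init foo_cnt_register(void)
{
	return clocksource_register_hz(&foo_cnt_cs, 32768);
}

With such a clocksource registered, clocksource_start_suspend_timing() and clocksource_stop_suspend_timing() below can measure the suspended interval even when a different, suspendable clocksource is driving timekeeping.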
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Cc: Daniel Lezcano Reviewed-by: Thomas Gleixner Reviewed-by: Daniel Lezcano Suggested-by: Thomas Gleixner Signed-off-by: Baolin Wang [jstultz: minor tweaks to merge with previous resume changes] Signed-off-by: John Stultz --- kernel/time/clocksource.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 22 ++++--- 2 files changed, 163 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f89a78e2792b..f74fb00d8064 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -94,6 +94,8 @@ EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]--------- * curr_clocksource: * currently selected clocksource. + * suspend_clocksource: + * used to calculate the suspend time. * clocksource_list: * linked list with the registered clocksources * clocksource_mutex: @@ -102,10 +104,12 @@ EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); * Name of the user-specified clocksource. */ static struct clocksource *curr_clocksource; +static struct clocksource *suspend_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); static char override_name[CS_NAME_LEN]; static int finished_booting; +static u64 suspend_start; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); @@ -447,6 +451,140 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) { } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ +static bool clocksource_is_suspend(struct clocksource *cs) +{ + return cs == suspend_clocksource; +} + +static void __clocksource_suspend_select(struct clocksource *cs) +{ + /* + * Skip the clocksource which will be stopped in suspend state. + */ + if (!(cs->flags & CLOCK_SOURCE_SUSPEND_NONSTOP)) + return; + + /* + * The nonstop clocksource can be selected as the suspend clocksource to + * calculate the suspend time, so it should not supply suspend/resume + * interfaces to suspend the nonstop clocksource when system suspends. + */ + if (cs->suspend || cs->resume) { + pr_warn("Nonstop clocksource %s should not supply suspend/resume interfaces\n", + cs->name); + } + + /* Pick the best rating. */ + if (!suspend_clocksource || cs->rating > suspend_clocksource->rating) + suspend_clocksource = cs; +} + +/** + * clocksource_suspend_select - Select the best clocksource for suspend timing + * @fallback: if select a fallback clocksource + */ +static void clocksource_suspend_select(bool fallback) +{ + struct clocksource *cs, *old_suspend; + + old_suspend = suspend_clocksource; + if (fallback) + suspend_clocksource = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_suspend) + continue; + + __clocksource_suspend_select(cs); + } +} + +/** + * clocksource_start_suspend_timing - Start measuring the suspend timing + * @cs: current clocksource from timekeeping + * @start_cycles: current cycles from timekeeping + * + * This function will save the start cycle values of suspend timer to calculate + * the suspend time when resuming system. + * + * This function is called late in the suspend process from timekeeping_suspend(), + * that means processes are freezed, non-boot cpus and interrupts are disabled + * now. It is therefore possible to start the suspend timer without taking the + * clocksource mutex. 
+ */ +void clocksource_start_suspend_timing(struct clocksource *cs, u64 start_cycles) +{ + if (!suspend_clocksource) + return; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value as suspend_start to avoid same reading + * from suspend timer. + */ + if (clocksource_is_suspend(cs)) { + suspend_start = start_cycles; + return; + } + + if (suspend_clocksource->enable && + suspend_clocksource->enable(suspend_clocksource)) { + pr_warn_once("Failed to enable the non-suspend-able clocksource.\n"); + return; + } + + suspend_start = suspend_clocksource->read(suspend_clocksource); +} + +/** + * clocksource_stop_suspend_timing - Stop measuring the suspend timing + * @cs: current clocksource from timekeeping + * @cycle_now: current cycles from timekeeping + * + * This function will calculate the suspend time from suspend timer. + * + * Returns nanoseconds since suspend started, 0 if no usable suspend clocksource. + * + * This function is called early in the resume process from timekeeping_resume(), + * that means there is only one cpu, no processes are running and the interrupts + * are disabled. It is therefore possible to stop the suspend timer without + * taking the clocksource mutex. + */ +u64 clocksource_stop_suspend_timing(struct clocksource *cs, u64 cycle_now) +{ + u64 now, delta, nsec = 0; + + if (!suspend_clocksource) + return 0; + + /* + * If current clocksource is the suspend timer, we should use the + * tkr_mono.cycle_last value from timekeeping as current cycle to + * avoid same reading from suspend timer. + */ + if (clocksource_is_suspend(cs)) + now = cycle_now; + else + now = suspend_clocksource->read(suspend_clocksource); + + if (now > suspend_start) { + delta = clocksource_delta(now, suspend_start, + suspend_clocksource->mask); + nsec = mul_u64_u32_shr(delta, suspend_clocksource->mult, + suspend_clocksource->shift); + } + + /* + * Disable the suspend timer to save power if current clocksource is + * not the suspend timer. + */ + if (!clocksource_is_suspend(cs) && suspend_clocksource->disable) + suspend_clocksource->disable(suspend_clocksource); + + return nsec; +} + /** * clocksource_suspend - suspend the clocksource(s) */ @@ -792,6 +930,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_select(); clocksource_select_watchdog(false); + __clocksource_suspend_select(cs); mutex_unlock(&clocksource_mutex); return 0; } @@ -820,6 +959,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) clocksource_select(); clocksource_select_watchdog(false); + clocksource_suspend_select(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -845,6 +985,15 @@ static int clocksource_unbind(struct clocksource *cs) return -EBUSY; } + if (clocksource_is_suspend(cs)) { + /* + * Select and try to install a replacement suspend clocksource. + * If no replacement suspend clocksource, we will just let the + * clocksource go and have no suspend clocksource. 
+ */ + clocksource_suspend_select(true); + } + clocksource_watchdog_lock(&flags); clocksource_dequeue_watchdog(cs); list_del_init(&cs->list); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 19414b16cf9e..d9e659a12c76 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1692,7 +1692,7 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - u64 cycle_now; + u64 cycle_now, nsec; bool inject_sleeptime = false; read_persistent_clock64(&ts_new); @@ -1716,13 +1716,8 @@ void timekeeping_resume(void) * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tk->tkr_mono); - if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > tk->tkr_mono.cycle_last) { - u64 nsec, cyc_delta; - - cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, - tk->tkr_mono.mask); - nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift); + nsec = clocksource_stop_suspend_timing(clock, cycle_now); + if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { @@ -1757,6 +1752,8 @@ int timekeeping_suspend(void) unsigned long flags; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; + struct clocksource *curr_clock; + u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); @@ -1775,6 +1772,15 @@ int timekeeping_suspend(void) timekeeping_forward_now(tk); timekeeping_suspended = 1; + /* + * Since we've called forward_now, cycle_last stores the value + * just read from the current clocksource. Save this to potentially + * use in suspend timing. + */ + curr_clock = tk->tkr_mono.clock; + cycle_now = tk->tkr_mono.cycle_last; + clocksource_start_suspend_timing(curr_clock, cycle_now); + if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, -- cgit v1.2.3 From 9407f5a7ee77c631d1e100436132437cf6237e45 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Jul 2018 10:09:11 +0200 Subject: sched/clock: Close a hole in sched_clock_init() All data required for the 'unstable' sched_clock must be set-up _before_ enabling it -- setting sched_clock_running. This includes the __gtod_offset but also a recent scd stamp. Make the gtod-offset update also set the csd stamp -- it requires the same two clock reads _anyway_. This doesn't hurt in the sched_clock_tick_stable() case and ensures sched_clock_init() gets everything set-up before use. Also switch to unconditional IRQ-disable/enable because the static key stuff already requires this is not ran with IRQs disabled. 
Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Pavel Tatashin Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180720080911.GM2494@hirez.programming.kicks-ass.net --- kernel/sched/clock.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c5c47ad3f386..811a39aca1ce 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -197,13 +197,14 @@ void clear_sched_clock_stable(void) static void __sched_clock_gtod_offset(void) { - __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns(); + struct sched_clock_data *scd = this_scd(); + + __scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; } void __init sched_clock_init(void) { - unsigned long flags; - /* * Set __gtod_offset such that once we mark sched_clock_running, * sched_clock_tick() continues where sched_clock() left off. @@ -211,16 +212,11 @@ void __init sched_clock_init(void) * Even if TSC is buggered, we're still UP at this point so it * can't really be out of sync. */ - local_irq_save(flags); + local_irq_disable(); __sched_clock_gtod_offset(); - local_irq_restore(flags); + local_irq_enable(); static_branch_inc(&sched_clock_running); - - /* Now that sched_clock_running is set adjust scd */ - local_irq_save(flags); - sched_clock_tick(); - local_irq_restore(flags); } /* * We run this as late_initcall() such that it runs after all built-in drivers, -- cgit v1.2.3 From 488dee96bb62f0b3d9e678cf42574034d5b033a5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:47 +0000 Subject: kernfs: allow creating kernfs objects with arbitrary uid/gid This change allows creating kernfs files and directories with arbitrary uid/gid instead of always using GLOBAL_ROOT_UID/GID by extending kernfs_create_dir_ns() and kernfs_create_file_ns() with uid/gid arguments. The "simple" kernfs_create_file() and kernfs_create_dir() are left alone and always create objects belonging to the global root. When creating symlinks ownership (uid/gid) is taken from the target kernfs object. Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. 
Miller --- kernel/cgroup/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 077370bf8964..35cf3d71f8aa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3557,7 +3557,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), - cgroup_file_mode(cft), 0, cft->kf_ops, cft, + cgroup_file_mode(cft), + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); -- cgit v1.2.3 From 73d5e2b472640b1fcdb61ae8be389912ef211bda Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 24 Jul 2018 18:17:40 +0200 Subject: cpu/hotplug: detect SMT disabled by BIOS If SMT is disabled in BIOS, the CPU code doesn't properly detect it. The /sys/devices/system/cpu/smt/control file shows 'on', and the 'l1tf' vulnerabilities file shows SMT as vulnerable. Fix it by forcing 'cpu_smt_control' to CPU_SMT_NOT_SUPPORTED in such a case. Unfortunately the detection can only be done after bringing all the CPUs online, so we have to overwrite any previous writes to the variable. Reported-by: Joe Mario Tested-by: Jiri Kosina Fixes: f048c399e0f7 ("x86/topology: Provide topology_smt_supported()") Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra --- kernel/cpu.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 37eec872042b..2bc3d2f5b2a5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2125,6 +2125,15 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_state_init(void) { + /* + * If SMT was disabled by BIOS, detect it here, after the CPUs have + * been brought online. This ensures the smt/l1tf sysfs entries are + * consistent with reality. Note this may overwrite cpu_smt_control's + * previous setting. + */ + if (topology_max_smt_threads() == 1) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -- cgit v1.2.3 From 2e62c4743adc4c7bfcbc1f45118fc7bec58cf30a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 19 Jul 2018 14:00:06 +0200 Subject: sched/fair: Remove #ifdefs from scale_rt_capacity() Reuse cpu_util_irq() that has been defined for schedutil and set irq util to 0 when !CONFIG_IRQ_TIME_ACCOUNTING. But the compiler is not able to optimize the sequence (at least with aarch64 GCC 7.2.1): free *= (max - irq); free /= max; when irq is fixed to 0. Add a new inline function scale_irq_capacity() that will scale utilization when irq is accounted. Reuse this function in schedutil which applies a similar formula.
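As a rough stand-alone illustration of that formula (the helper name mirrors the scale_irq_capacity() added in the diff below; the value 1024 stands in for SCHED_CAPACITY_SCALE and the sample numbers are invented):

        #include <stdio.h>

        /* Mirrors the changelog's sequence: free *= (max - irq); free /= max; */
        static unsigned long scale_irq_capacity(unsigned long util,
                                                unsigned long irq,
                                                unsigned long max)
        {
                util *= (max - irq);
                util /= max;
                return util;
        }

        int main(void)
        {
                /* capacity 1024 with 128 units consumed by IRQ: 800 scales down to 700 */
                printf("%lu\n", scale_irq_capacity(800, 128, 1024));
                return 0;
        }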
Suggested-by: Ingo Molnar Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1532001606-6689-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/cpufreq_schedutil.c | 3 +-- kernel/sched/fair.c | 13 +++---------- kernel/sched/sched.h | 20 ++++++++++++++++++-- 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c3cf7d992159..fc177c06e490 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -177,7 +177,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef HAVE_SCHED_AVG_IRQ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 97dcd4472a0e..3fffad3bc8a8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -247,8 +247,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * U' = irq + ------- * U * max */ - util *= (max - irq); - util /= max; + util = scale_irq_capacity(util, irq, max); util += irq; /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5f7d521e448..14c3fddf822a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7551,16 +7551,12 @@ static unsigned long scale_rt_capacity(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(NULL, cpu); unsigned long used, free; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) unsigned long irq; -#endif -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - irq = READ_ONCE(rq->avg_irq.util_avg); + irq = cpu_util_irq(rq); if (unlikely(irq >= max)) return 1; -#endif used = READ_ONCE(rq->avg_rt.util_avg); used += READ_ONCE(rq->avg_dl.util_avg); @@ -7569,11 +7565,8 @@ static unsigned long scale_rt_capacity(int cpu) return 1; free = max - used; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - free *= (max - irq); - free /= max; -#endif - return free; + + return scale_irq_capacity(free, irq, max); } static void update_cpu_capacity(struct sched_domain *sd, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ebb4b3c3ece7..614170d9b1aa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -856,6 +856,7 @@ struct rq { struct sched_avg avg_rt; struct sched_avg avg_dl; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#define HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif u64 idle_stamp; @@ -2210,17 +2211,32 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#endif -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { return rq->avg_irq.util_avg; } + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} #else static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; } -#endif +static inline +unsigned long scale_irq_capacity(unsigned long 
util, unsigned long irq, unsigned long max) +{ + return util; +} #endif -- cgit v1.2.3 From 3d6c50c27bd6418dceb51642540ecfcb8ca708c2 Mon Sep 17 00:00:00 2001 From: Yun Wang Date: Wed, 4 Jul 2018 11:27:27 +0800 Subject: sched/debug: Show the sum wait time of a task group Although we can rely on cpuacct to present the CPU usage of task groups, it is hard to tell how intense the competition is between these groups on CPU resources. Monitoring the wait time or sched_debug of each process could be very expensive, and there is no good way to accurately represent the conflict with these info, we need the wait time on group dimension. Thus we introduce group's wait_sum to represent the resource conflict between task groups, which is simply the sum of the wait time of the group's cfs_rq. The 'cpu.stat' is modified to show the statistic, like: nr_periods 0 nr_throttled 0 throttled_time 0 wait_sum 2035098795584 Now we can monitor the changes of wait_sum to tell how much a a task group is suffering in the fight of CPU resources. For example: (wait_sum - last_wait_sum) * 100 / (nr_cpu * period_ns) == X% means the task group paid X percentage of period on waiting for the CPU. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ff7dae3b-e5f9-7157-1caa-ff02c6b23dc1@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc177c06e490..2bc391a574e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6748,6 +6748,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); + if (schedstat_enabled() && tg != &root_task_group) { + u64 ws = 0; + int i; + + for_each_possible_cpu(i) + ws += schedstat_val(tg->se[i]->statistics.wait_sum); + + seq_printf(sf, "wait_sum %llu\n", ws); + } + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ -- cgit v1.2.3 From 305c1fac3225dfa7eeb89bfe91b7335a6edd5172 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:43 +0530 Subject: sched/numa: Evaluate move once per node task_numa_compare() helps choose the best CPU to move or swap the selected task. To achieve this task_numa_compare() is called for every CPU in the node. Currently it evaluates if the task can be moved/swapped for each of the CPUs. However the move evaluation is mostly independent of the CPU. Evaluating the move logic once per node, provides scope for simplifying task_numa_compare(). 
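Condensed, the reworked task_numa_find_cpu() reads roughly as follows (simplified from the patch below; kernel-internal helpers such as task_h_load(), load_too_imbalanced() and for_each_cpu() are used as in the diff, so this is a sketch rather than a stand-alone snippet):

        static void task_numa_find_cpu(struct task_numa_env *env,
                                       long taskimp, long groupimp)
        {
                long load = task_h_load(env->p);
                bool maymove;
                int cpu;

                /* Decided once per node: would a plain move keep the load balanced? */
                maymove = !load_too_imbalanced(env->src_stats.load - load,
                                               env->dst_stats.load + load, env);

                for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
                        /* Skip this CPU if the source task cannot migrate */
                        if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
                                continue;

                        env->dst_cpu = cpu;
                        /* The same maymove answer is reused for every candidate CPU. */
                        task_numa_compare(env, taskimp, groupimp, maymove);
                }
        }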
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25705.2 25058.2 -2.51 1 74433 72950 -1.99 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 96589.6 105930 9.670 1 181830 178624 -1.76 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 440.65 941.32 758.98 189.17 numa01.sh Sys: 183.48 320.07 258.42 50.09 numa01.sh User: 37384.65 71818.14 60302.51 13798.96 numa02.sh Real: 61.24 65.35 62.49 1.49 numa02.sh Sys: 16.83 24.18 21.40 2.60 numa02.sh User: 5219.59 5356.34 5264.03 49.07 numa03.sh Real: 822.04 912.40 873.55 37.35 numa03.sh Sys: 118.80 140.94 132.90 7.60 numa03.sh User: 62485.19 70025.01 67208.33 2967.10 numa04.sh Real: 690.66 872.12 778.49 65.44 numa04.sh Sys: 459.26 563.03 494.03 42.39 numa04.sh User: 51116.44 70527.20 58849.44 8461.28 numa05.sh Real: 418.37 562.28 525.77 54.27 numa05.sh Sys: 299.45 481.00 392.49 64.27 numa05.sh User: 34115.09 41324.02 39105.30 2627.68 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 516.14 892.41 739.84 151.32 2.587% numa01.sh Sys: 153.16 192.99 177.70 14.58 45.42% numa01.sh User: 39821.04 69528.92 57193.87 10989.48 5.435% numa02.sh Real: 60.91 62.35 61.58 0.63 1.477% numa02.sh Sys: 16.47 26.16 21.20 3.85 0.943% numa02.sh User: 5227.58 5309.61 5265.17 31.04 -0.02% numa03.sh Real: 739.07 917.73 795.75 64.45 9.776% numa03.sh Sys: 94.46 136.08 109.48 14.58 21.39% numa03.sh User: 57478.56 72014.09 61764.48 5343.69 8.813% numa04.sh Real: 442.61 715.43 530.31 96.12 46.79% numa04.sh Sys: 224.90 348.63 285.61 48.83 72.97% numa04.sh User: 35836.84 47522.47 40235.41 3985.26 46.26% numa05.sh Real: 386.13 489.17 434.94 43.59 20.88% numa05.sh Sys: 144.29 438.56 278.80 105.78 40.77% numa05.sh User: 33255.86 36890.82 34879.31 1641.98 12.11% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 128 +++++++++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14c3fddf822a..b10e0663a49e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1580,9 +1580,8 @@ static bool load_too_imbalanced(long src_load, long dst_load, * be exchanged with the source task */ static void task_numa_compare(struct task_numa_env *env, - long taskimp, long groupimp) + long taskimp, long groupimp, bool maymove) { - struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; long src_load, dst_load; @@ -1603,97 +1602,73 @@ static void task_numa_compare(struct task_numa_env *env, if (cur == env->p) goto unlock; + if (!cur) { + if (maymove || imp > env->best_imp) + goto assign; + else + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative - * the value is, the more rmeote accesses that would be expected to + * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. 
*/ - if (cur) { - /* Skip this swap candidate if cannot move to the source CPU: */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) - goto unlock; + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) + goto unlock; + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* - * If dst and source tasks are in the same NUMA group, or not - * in any group then look only at task weights. + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. */ - if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - /* - * Add some hysteresis to prevent swapping the - * tasks within a group over tiny differences. - */ - if (cur->numa_group) - imp -= imp/16; - } else { - /* - * Compare the group weights. If a task is all by - * itself (not part of a group), use the task weight - * instead. - */ - if (cur->numa_group) - imp += group_weight(cur, env->src_nid, dist) - - group_weight(cur, env->dst_nid, dist); - else - imp += task_weight(cur, env->src_nid, dist) - - task_weight(cur, env->dst_nid, dist); - } + if (cur->numa_group) + imp -= imp / 16; + } else { + /* + * Compare the group weights. If a task is all by itself + * (not part of a group), use the task weight instead. + */ + if (cur->numa_group && env->p->numa_group) + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); + else + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp && moveimp <= env->best_imp) + if (imp <= env->best_imp) goto unlock; - if (!cur) { - /* Is there capacity at our destination? */ - if (env->src_stats.nr_running <= env->src_stats.task_capacity && - !env->dst_stats.has_free_capacity) - goto unlock; - - goto balance; - } - - /* Balance doesn't matter much if we're running a task per CPU: */ - if (imp > env->best_imp && src_rq->nr_running == 1 && - dst_rq->nr_running == 1) + if (maymove && moveimp > imp && moveimp > env->best_imp) { + imp = moveimp - 1; + cur = NULL; goto assign; + } /* * In the overloaded case, try and keep the load balanced. */ -balance: - load = task_h_load(env->p); + load = task_h_load(env->p) - task_h_load(cur); + if (!load) + goto assign; + dst_load = env->dst_stats.load + load; src_load = env->src_stats.load - load; - if (moveimp > imp && moveimp > env->best_imp) { - /* - * If the improvement from just moving env->p direction is - * better than swapping tasks around, check if a move is - * possible. Store a slightly smaller score than moveimp, - * so an actually idle CPU will win. - */ - if (!load_too_imbalanced(src_load, dst_load, env)) { - imp = moveimp - 1; - cur = NULL; - goto assign; - } - } - - if (imp <= env->best_imp) - goto unlock; - - if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; - } - if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; +assign: /* * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. 
@@ -1709,7 +1684,6 @@ balance: local_irq_enable(); } -assign: task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); @@ -1718,15 +1692,27 @@ unlock: static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { + long src_load, dst_load, load; + bool maymove = false; int cpu; + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + maymove = !load_too_imbalanced(src_load, dst_load, env); + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp); + task_numa_compare(env, taskimp, groupimp, maymove); } } -- cgit v1.2.3 From 5f95ba7a43057f28a349ea1f03ee8d04e0f445ea Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:44 +0530 Subject: sched/numa: Simplify load_too_imbalanced() Currently load_too_imbalance() cares about the slope of imbalance. It doesn't care of the direction of the imbalance. However this may not work if nodes that are being compared have dissimilar capacities. Few nodes might have more cores than other nodes in the system. Also unlike traditional load balance at a NUMA sched domain, multiple requests to migrate from the same source node to same destination node may run in parallel. This can cause huge load imbalance. This is specially true on a larger machines with either large cores per node or more number of nodes in the system. Hence allow move/swap only if the imbalance is going to reduce. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25058.2 25122.9 0.25 1 72950 73850 1.23 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 516.14 892.41 739.84 151.32 numa01.sh Sys: 153.16 192.99 177.70 14.58 numa01.sh User: 39821.04 69528.92 57193.87 10989.48 numa02.sh Real: 60.91 62.35 61.58 0.63 numa02.sh Sys: 16.47 26.16 21.20 3.85 numa02.sh User: 5227.58 5309.61 5265.17 31.04 numa03.sh Real: 739.07 917.73 795.75 64.45 numa03.sh Sys: 94.46 136.08 109.48 14.58 numa03.sh User: 57478.56 72014.09 61764.48 5343.69 numa04.sh Real: 442.61 715.43 530.31 96.12 numa04.sh Sys: 224.90 348.63 285.61 48.83 numa04.sh User: 35836.84 47522.47 40235.41 3985.26 numa05.sh Real: 386.13 489.17 434.94 43.59 numa05.sh Sys: 144.29 438.56 278.80 105.78 numa05.sh User: 33255.86 36890.82 34879.31 1641.98 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 435.78 653.81 534.58 83.20 38.39% numa01.sh Sys: 121.93 187.18 145.90 23.47 21.79% numa01.sh User: 37082.81 51402.80 43647.60 5409.75 31.03% numa02.sh Real: 60.64 61.63 61.19 0.40 0.637% numa02.sh Sys: 14.72 25.68 19.06 4.03 11.22% numa02.sh User: 5210.95 5266.69 5233.30 20.82 0.608% numa03.sh Real: 746.51 808.24 780.36 23.88 1.972% numa03.sh Sys: 97.26 108.48 105.07 4.28 4.197% numa03.sh User: 58956.30 61397.05 60162.95 1050.82 2.661% numa04.sh Real: 465.97 519.27 484.81 19.62 9.385% numa04.sh Sys: 304.43 359.08 334.68 20.64 -14.6% numa04.sh User: 37544.16 41186.15 39262.44 1314.91 2.478% numa05.sh Real: 411.57 457.20 433.29 16.58 0.380% numa05.sh Sys: 230.05 435.48 339.95 67.58 -17.9% numa05.sh User: 33325.54 36896.31 35637.84 1222.64 -2.12% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel 
Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b10e0663a49e..226837960ec0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1546,28 +1546,12 @@ static bool load_too_imbalanced(long src_load, long dst_load, src_capacity = env->src_stats.compute_capacity; dst_capacity = env->dst_stats.compute_capacity; - /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); - - /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; - if (imb <= 0) - return false; + imb = abs(dst_load * src_capacity - src_load * dst_capacity); - /* - * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. - */ orig_src_load = env->src_stats.load; orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); /* Would this change make things worse? */ return (imb > old_imb); -- cgit v1.2.3 From 8cd45eee43bd46b933158b25aa7c742e0f3e811f Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:45 +0530 Subject: sched/numa: Set preferred_node based on best_cpu Currently preferred node is set to dst_nid which is the last node in the iteration whose group weight or task weight is greater than the current node. However it doesn't guarantee that dst_nid has the numa capacity to move. It also doesn't guarantee that dst_nid has the best_cpu which is the CPU/node ideal for node migration. Lets consider faults on a 4 node system with group weight numbers in different nodes being in 0 < 1 < 2 < 3 proportion. Consider the task is running on 3 and 0 is its preferred node but its capacity is full. Consider nodes 1, 2 and 3 have capacity. Then the task should be migrated to node 1. Currently the task gets moved to node 2. env.dst_nid points to the last node whose faults were greater than current node. Modify to set the preferred node based of best_cpu. Earlier setting preferred node was skipped if nr_active_nodes is 1. This could result in the task being moved out of the preferred node to a random node during regular load balancing. Also while modifying task_numa_migrate(), use sched_setnuma to set preferred node. This ensures out numa accounting is correct. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25122.9 25549.6 1.698 1 73850 73190 -0.89 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 105930 113437 7.08676 1 178624 196130 9.80047 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 435.78 653.81 534.58 83.20 numa01.sh Sys: 121.93 187.18 145.90 23.47 numa01.sh User: 37082.81 51402.80 43647.60 5409.75 numa02.sh Real: 60.64 61.63 61.19 0.40 numa02.sh Sys: 14.72 25.68 19.06 4.03 numa02.sh User: 5210.95 5266.69 5233.30 20.82 numa03.sh Real: 746.51 808.24 780.36 23.88 numa03.sh Sys: 97.26 108.48 105.07 4.28 numa03.sh User: 58956.30 61397.05 60162.95 1050.82 numa04.sh Real: 465.97 519.27 484.81 19.62 numa04.sh Sys: 304.43 359.08 334.68 20.64 numa04.sh User: 37544.16 41186.15 39262.44 1314.91 numa05.sh Real: 411.57 457.20 433.29 16.58 numa05.sh Sys: 230.05 435.48 339.95 67.58 numa05.sh User: 33325.54 36896.31 35637.84 1222.64 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 506.35 794.46 599.06 104.26 -10.76% numa01.sh Sys: 150.37 223.56 195.99 24.94 -25.55% numa01.sh User: 43450.69 61752.04 49281.50 6635.33 -11.43% numa02.sh Real: 60.33 62.40 61.31 0.90 -0.195% numa02.sh Sys: 18.12 31.66 24.28 5.89 -21.49% numa02.sh User: 5203.91 5325.32 5260.29 49.98 -0.513% numa03.sh Real: 696.47 853.62 745.80 57.28 4.6339% numa03.sh Sys: 85.68 123.71 97.89 13.48 7.3347% numa03.sh User: 55978.45 66418.63 59254.94 3737.97 1.5323% numa04.sh Real: 444.05 514.83 497.06 26.85 -2.464% numa04.sh Sys: 230.39 375.79 316.23 48.58 5.8343% numa04.sh User: 35403.12 41004.10 39720.80 2163.08 -1.153% numa05.sh Real: 423.09 460.41 439.57 13.92 -1.428% numa05.sh Sys: 287.38 480.15 369.37 68.52 -7.964% numa05.sh User: 34732.12 38016.80 36255.85 1070.51 -1.704% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-5-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 226837960ec0..0532195c38d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1765,7 +1765,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = task_node(p); + sched_setnuma(p, task_node(p)); return -EINVAL; } @@ -1824,15 +1824,13 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { - struct numa_group *ng = p->numa_group; - if (env.best_cpu == -1) nid = env.src_nid; else - nid = env.dst_nid; + nid = cpu_to_node(env.best_cpu); - if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) - sched_setnuma(p, env.dst_nid); + if (nid != p->numa_preferred_nid) + sched_setnuma(p, nid); } /* No better CPU than the current one was found. */ -- cgit v1.2.3 From f03bb6760b8e5e2bcecc88d2a2ef41c09adcab39 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:46 +0530 Subject: sched/numa: Use task faults only if numa_group is not yet set up When numa_group faults are available, task_numa_placement only uses numa_group faults to evaluate preferred node. 
However it still accounts task faults and even evaluates the preferred node just based on task faults just to discard it in favour of preferred node chosen on the basis of numa_group. Instead use task faults only if numa_group is not set. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25549.6 25215.7 -1.30 1 73190 72107 -1.47 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 113437 113372 -0.05 1 196130 177403 -9.54 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 506.35 794.46 599.06 104.26 numa01.sh Sys: 150.37 223.56 195.99 24.94 numa01.sh User: 43450.69 61752.04 49281.50 6635.33 numa02.sh Real: 60.33 62.40 61.31 0.90 numa02.sh Sys: 18.12 31.66 24.28 5.89 numa02.sh User: 5203.91 5325.32 5260.29 49.98 numa03.sh Real: 696.47 853.62 745.80 57.28 numa03.sh Sys: 85.68 123.71 97.89 13.48 numa03.sh User: 55978.45 66418.63 59254.94 3737.97 numa04.sh Real: 444.05 514.83 497.06 26.85 numa04.sh Sys: 230.39 375.79 316.23 48.58 numa04.sh User: 35403.12 41004.10 39720.80 2163.08 numa05.sh Real: 423.09 460.41 439.57 13.92 numa05.sh Sys: 287.38 480.15 369.37 68.52 numa05.sh User: 34732.12 38016.80 36255.85 1070.51 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 478.45 565.90 515.11 30.87 16.29% numa01.sh Sys: 207.79 271.04 232.94 21.33 -15.8% numa01.sh User: 39763.93 47303.12 43210.73 2644.86 14.04% numa02.sh Real: 60.00 61.46 60.78 0.49 0.871% numa02.sh Sys: 15.71 25.31 20.69 3.42 17.35% numa02.sh User: 5175.92 5265.86 5235.97 32.82 0.464% numa03.sh Real: 776.42 834.85 806.01 23.22 -7.47% numa03.sh Sys: 114.43 128.75 121.65 5.49 -19.5% numa03.sh User: 60773.93 64855.25 62616.91 1576.39 -5.36% numa04.sh Real: 456.93 511.95 482.91 20.88 2.930% numa04.sh Sys: 178.09 460.89 356.86 94.58 -11.3% numa04.sh User: 36312.09 42553.24 39623.21 2247.96 0.246% numa05.sh Real: 393.98 493.48 436.61 35.59 0.677% numa05.sh Sys: 164.49 329.15 265.87 61.78 38.92% numa05.sh User: 33182.65 36654.53 35074.51 1187.71 3.368% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-6-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0532195c38d0..a10c4f8f47e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2110,8 +2110,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1, max_group_nid = -1; - unsigned long max_faults = 0, max_group_faults = 0; + int seq, nid, max_nid = -1; + unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; u64 runtime, period; @@ -2190,15 +2190,15 @@ static void task_numa_placement(struct task_struct *p) } } - if (faults > max_faults) { - max_faults = faults; + if (!p->numa_group) { + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } else if (group_faults > max_faults) { + max_faults = group_faults; max_nid = nid; } - - if (group_faults > max_group_faults) { - max_group_faults = group_faults; - max_group_nid = nid; - } } update_task_scan_period(p, fault_types[0], fault_types[1]); @@ -2206,7 +2206,7 @@ static void 
task_numa_placement(struct task_struct *p) if (p->numa_group) { numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); - max_nid = preferred_group_nid(p, max_group_nid); + max_nid = preferred_group_nid(p, max_nid); } if (max_faults) { -- cgit v1.2.3 From 67d9f6c256cd66e15f85c92670f52a7ad4689cff Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:47 +0530 Subject: sched/debug: Reverse the order of printing faults Fix the order in which the private and shared numa faults are getting printed. No functional changes. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25215.7 25375.3 0.63 1 72107 72617 0.70 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-7-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c96e89cc4bc7..870d4f3da285 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -842,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf) { SEQ_printf(m, "numa_faults node=%d ", node); - SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); - SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); + SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf); + SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf); } #endif -- cgit v1.2.3 From 0ee7e74dc0dc64d9900751d03c5c22dfdd173fb8 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:48 +0530 Subject: sched/numa: Skip nodes that are at 'hoplimit' When comparing two nodes at a distance of 'hoplimit', we should consider nodes only up to 'hoplimit'. Currently we also consider nodes at 'oplimit' distance too. Hence two nodes at a distance of 'hoplimit' will have same groupweight. Fix this by skipping nodes at hoplimit. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25375.3 25308.6 -0.26 1 72617 72964 0.477 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 113372 108750 -4.07684 1 177403 183115 3.21979 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 478.45 565.90 515.11 30.87 numa01.sh Sys: 207.79 271.04 232.94 21.33 numa01.sh User: 39763.93 47303.12 43210.73 2644.86 numa02.sh Real: 60.00 61.46 60.78 0.49 numa02.sh Sys: 15.71 25.31 20.69 3.42 numa02.sh User: 5175.92 5265.86 5235.97 32.82 numa03.sh Real: 776.42 834.85 806.01 23.22 numa03.sh Sys: 114.43 128.75 121.65 5.49 numa03.sh User: 60773.93 64855.25 62616.91 1576.39 numa04.sh Real: 456.93 511.95 482.91 20.88 numa04.sh Sys: 178.09 460.89 356.86 94.58 numa04.sh User: 36312.09 42553.24 39623.21 2247.96 numa05.sh Real: 393.98 493.48 436.61 35.59 numa05.sh Sys: 164.49 329.15 265.87 61.78 numa05.sh User: 33182.65 36654.53 35074.51 1187.71 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 414.64 819.20 556.08 147.70 -7.36% numa01.sh Sys: 77.52 205.04 139.40 52.05 67.10% numa01.sh User: 37043.24 61757.88 45517.48 9290.38 -5.06% numa02.sh Real: 60.80 63.32 61.63 0.88 -1.37% numa02.sh Sys: 17.35 39.37 25.71 7.33 -19.5% numa02.sh User: 5213.79 5374.73 5268.90 55.09 -0.62% numa03.sh Real: 780.09 948.64 831.43 63.02 -3.05% numa03.sh Sys: 104.96 136.92 116.31 11.34 4.591% numa03.sh User: 60465.42 73339.78 64368.03 4700.14 -2.72% numa04.sh Real: 412.60 681.92 521.29 96.64 -7.36% numa04.sh Sys: 210.32 314.10 251.77 37.71 41.74% numa04.sh User: 34026.38 45581.20 38534.49 4198.53 2.825% numa05.sh Real: 394.79 439.63 411.35 16.87 6.140% numa05.sh Sys: 238.32 330.09 292.31 38.32 -9.04% numa05.sh User: 33456.45 34876.07 34138.62 609.45 2.741% While there is a regression with this change, this change is needed from a correctness perspective. Also it helps consolidation as seen from perf bench output. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-8-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a10c4f8f47e8..e5f39e8dfe53 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1312,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * of each group. Skip other nodes. */ if (sched_numa_topology_type == NUMA_BACKPLANE && - dist > maxdist) + dist >= maxdist) continue; /* Add up the faults from nearby nodes. */ -- cgit v1.2.3 From 10864a9e222048a862da2c21efa28929a4dfed15 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:49 +0530 Subject: sched/numa: Remove unused task_capacity from 'struct numa_stats' The task_capacity field in 'struct numa_stats' is redundant. Also move nr_running for better packing within the struct. No functional changes. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25308.6 25377.3 0.271 1 72964 72287 -0.92 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-9-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e5f39e8dfe53..4ac60b296d96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1450,14 +1450,12 @@ static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { - unsigned long nr_running; unsigned long load; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long task_capacity; + unsigned int nr_running; int has_free_capacity; }; @@ -1495,9 +1493,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid) smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); capacity = cpus / smt; /* cores */ - ns->task_capacity = min_t(unsigned, capacity, + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); - ns->has_free_capacity = (ns->nr_running < ns->task_capacity); + ns->has_free_capacity = (ns->nr_running < capacity); } struct task_numa_env { -- cgit v1.2.3 From 0ad4e3dfe6cf3f207e61cbd8e3e4a943f8c1ad20 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:50 +0530 Subject: sched/numa: Modify migrate_swap() to accept additional parameters There are checks in migrate_swap_stop() that check if the task/CPU combination is as per migrate_swap_arg before migrating. However atleast one of the two tasks to be swapped by migrate_swap() could have migrated to a completely different CPU before updating the migrate_swap_arg. The new CPU where the task is currently running could be a different node too. If the task has migrated, numa balancer might end up placing a task in a wrong node. Instead of achieving node consolidation, it may end up spreading the load across nodes. To avoid that pass the CPUs as additional parameters. While here, place migrate_swap under CONFIG_NUMA_BALANCING. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25377.3 25226.6 -0.59 1 72287 73326 1.437 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-10-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- kernel/sched/fair.c | 3 ++- kernel/sched/sched.h | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2bc391a574e6..deafa9fe602b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1176,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -1257,16 +1258,17 @@ unlock: /* * Cross migrate two tasks */ -int migrate_swap(struct task_struct *cur, struct task_struct *p) +int migrate_swap(struct task_struct *cur, struct task_struct *p, + int target_cpu, int curr_cpu) { struct migration_swap_arg arg; int ret = -EINVAL; arg = (struct migration_swap_arg){ .src_task = cur, - .src_cpu = task_cpu(cur), + .src_cpu = curr_cpu, .dst_task = p, - .dst_cpu = task_cpu(p), + .dst_cpu = target_cpu, }; if (arg.src_cpu == arg.dst_cpu) @@ -1291,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) out: return ret; } +#endif /* CONFIG_NUMA_BALANCING */ /* * wait_task_inactive - wait for a thread to unschedule. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ac60b296d96..7b4eddec3ccc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1848,7 +1848,8 @@ static int task_numa_migrate(struct task_struct *p) return ret; } - ret = migrate_swap(p, env.best_task); + ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 614170d9b1aa..4a2e8cae63c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1099,7 +1099,8 @@ enum numa_faults_stats { }; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *, struct task_struct *); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); #else static inline void -- cgit v1.2.3 From 2d4056fafa196e1ab4e7161bae4df76f9602d56d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:53 +0530 Subject: sched/numa: Remove numa_has_capacity() task_numa_find_cpu() helps to find the CPU to swap/move the task to. It's guarded by numa_has_capacity(). However node not having capacity shouldn't deter a task swapping if it helps NUMA placement. Further load_too_imbalanced(), which evaluates possibilities of move/swap, provides similar checks as numa_has_capacity. Hence remove numa_has_capacity() to enhance possibilities of task swapping even if load is imbalanced. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25657.9 25804.1 0.569 1 74435 73413 -1.37 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-13-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b4eddec3ccc..3bcf0e864613 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1456,7 +1456,6 @@ struct numa_stats { unsigned long compute_capacity; unsigned int nr_running; - int has_free_capacity; }; /* @@ -1483,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_free_capacity, or we'll detect a huge - * imbalance and bail there. + * We'll detect a huge imbalance and bail there. */ if (!cpus) return; @@ -1495,7 +1493,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); - ns->has_free_capacity = (ns->nr_running < capacity); } struct task_numa_env { @@ -1698,31 +1695,6 @@ static void task_numa_find_cpu(struct task_numa_env *env, } } -/* Only move tasks to a NUMA node less busy than the current node. */ -static bool numa_has_capacity(struct task_numa_env *env) -{ - struct numa_stats *src = &env->src_stats; - struct numa_stats *dst = &env->dst_stats; - - if (src->has_free_capacity && !dst->has_free_capacity) - return false; - - /* - * Only consider a task move if the source has a higher load - * than the destination, corrected for CPU capacity on each node. - * - * src->load dst->load - * --------------------- vs --------------------- - * src->compute_capacity dst->compute_capacity - */ - if (src->load * dst->compute_capacity * env->imbalance_pct > - - dst->load * src->compute_capacity * 100) - return true; - - return false; -} - static int task_numa_migrate(struct task_struct *p) { struct task_numa_env env = { @@ -1777,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p) update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); /* * Look at other nodes in these cases: @@ -1808,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p) env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); - if (numa_has_capacity(&env)) - task_numa_find_cpu(&env, taskimp, groupimp); + task_numa_find_cpu(&env, taskimp, groupimp); } } -- cgit v1.2.3 From 30619c89b17d46808b4cdf5b3f81b6a01ade1473 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:55 +0530 Subject: sched/numa: Update the scan period without holding the numa_group lock The metrics for updating scan periods are local or task specific. Currently this update happens under the numa_group lock, which seems unnecessary. Hence move this update outside the lock. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25355.9 25645.4 1.141 1 72812 72142 -0.92 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-15-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bcf0e864613..fc33a4b40a09 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2170,8 +2170,6 @@ static void task_numa_placement(struct task_struct *p) } } - update_task_scan_period(p, fault_types[0], fault_types[1]); - if (p->numa_group) { numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); @@ -2186,6 +2184,8 @@ static void task_numa_placement(struct task_struct *p) if (task_node(p) != p->numa_preferred_nid) numa_migrate_preferred(p); } + + update_task_scan_period(p, fault_types[0], fault_types[1]); } static inline int get_numa_group(struct numa_group *grp) -- cgit v1.2.3 From f35678b6a17063f3b0d391af5ab8f8c83cf31b0c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:32:56 +0530 Subject: sched/numa: Use group_weights to identify if migration degrades locality On NUMA_BACKPLANE and NUMA_GLUELESS_MESH systems, tasks/memory should be consolidated to the closest group of nodes. In such a case, relying on group_fault metric may not always help to consolidate. There can always be a case where a node closer to the preferred node may have lesser faults than a node further away from the preferred node. In such a case, moving to node with more faults might avoid numa consolidation. Using group_weight would help to consolidate task/memory around the preferred_node. While here, to be on the conservative side, don't override migrate thread degrades locality logic for CPU_NEWLY_IDLE load balancing. Note: Similar problems exist with should_numa_migrate_memory and will be dealt separately. 
Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25645.4 25960 1.22 1 72142 73550 1.95 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 110199 120071 8.958 1 176303 176249 -0.03 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 490.04 774.86 596.26 96.46 numa01.sh Sys: 151.52 242.88 184.82 31.71 numa01.sh User: 41418.41 60844.59 48776.09 6564.27 numa02.sh Real: 60.14 62.94 60.98 1.00 numa02.sh Sys: 16.11 30.77 21.20 5.28 numa02.sh User: 5184.33 5311.09 5228.50 44.24 numa03.sh Real: 790.95 856.35 826.41 24.11 numa03.sh Sys: 114.93 118.85 117.05 1.63 numa03.sh User: 60990.99 64959.28 63470.43 1415.44 numa04.sh Real: 434.37 597.92 504.87 59.70 numa04.sh Sys: 237.63 397.40 289.74 55.98 numa04.sh User: 34854.87 41121.83 38572.52 2615.84 numa05.sh Real: 386.77 448.90 417.22 22.79 numa05.sh Sys: 149.23 379.95 303.04 79.55 numa05.sh User: 32951.76 35959.58 34562.18 1034.05 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 493.19 672.88 597.51 59.38 -0.20% numa01.sh Sys: 150.09 245.48 207.76 34.26 -11.0% numa01.sh User: 41928.51 53779.17 48747.06 3901.39 0.059% numa02.sh Real: 60.63 62.87 61.22 0.83 -0.39% numa02.sh Sys: 16.64 27.97 20.25 4.06 4.691% numa02.sh User: 5222.92 5309.60 5254.03 29.98 -0.48% numa03.sh Real: 821.52 902.15 863.60 32.41 -4.30% numa03.sh Sys: 112.04 130.66 118.35 7.08 -1.09% numa03.sh User: 62245.16 69165.14 66443.04 2450.32 -4.47% numa04.sh Real: 414.53 519.57 476.25 37.00 6.009% numa04.sh Sys: 181.84 335.67 280.41 54.07 3.327% numa04.sh User: 33924.50 39115.39 37343.78 1934.26 3.290% numa05.sh Real: 408.30 441.45 417.90 12.05 -0.16% numa05.sh Sys: 233.41 381.60 295.58 57.37 2.523% numa05.sh User: 33301.31 35972.50 34335.19 938.94 0.661% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-16-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc33a4b40a09..9c9e54ea65d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6899,8 +6899,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) return -1; @@ -6927,18 +6927,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return 0; /* Leaving a core idle is often worse than degrading locality. 
*/ - if (env->idle != CPU_NOT_IDLE) + if (env->idle == CPU_IDLE) return -1; + dist = node_distance(src_nid, dst_nid); if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); } - return dst_faults < src_faults; + return dst_weight < src_weight; } #else -- cgit v1.2.3 From b6a60cf36d497e7fbde9dd5b86fabd96850249f6 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 20 Jun 2018 22:33:00 +0530 Subject: sched/numa: Move task_numa_placement() closer to numa_migrate_preferred() numa_migrate_preferred() is called periodically or when task preferred node changes. Preferred node evaluations happen once per scan sequence. If the scan completion happens just after the periodic NUMA migration, then we try to migrate to the preferred node and the preferred node might change, needing another node migration. Avoid this by checking for scan sequence completion only when checking for periodic migration. Running SPECjbb2005 on a 4 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 16 25862.6 26158.1 1.14258 1 74357 72725 -2.19482 Running SPECjbb2005 on a 16 node machine and comparing bops/JVM JVMS LAST_PATCH WITH_PATCH %CHANGE 8 117019 113992 -2.58 1 179095 174947 -2.31 (numbers from v1 based on v4.17-rc5) Testcase Time: Min Max Avg StdDev numa01.sh Real: 449.46 770.77 615.22 101.70 numa01.sh Sys: 132.72 208.17 170.46 24.96 numa01.sh User: 39185.26 60290.89 50066.76 6807.84 numa02.sh Real: 60.85 61.79 61.28 0.37 numa02.sh Sys: 15.34 24.71 21.08 3.61 numa02.sh User: 5204.41 5249.85 5231.21 17.60 numa03.sh Real: 785.50 916.97 840.77 44.98 numa03.sh Sys: 108.08 133.60 119.43 8.82 numa03.sh User: 61422.86 70919.75 64720.87 3310.61 numa04.sh Real: 429.57 587.37 480.80 57.40 numa04.sh Sys: 240.61 321.97 290.84 33.58 numa04.sh User: 34597.65 40498.99 37079.48 2060.72 numa05.sh Real: 392.09 431.25 414.65 13.82 numa05.sh Sys: 229.41 372.48 297.54 53.14 numa05.sh User: 33390.86 34697.49 34222.43 556.42 Testcase Time: Min Max Avg StdDev %Change numa01.sh Real: 424.63 566.18 498.12 59.26 23.50% numa01.sh Sys: 160.19 256.53 208.98 37.02 -18.4% numa01.sh User: 37320.00 46225.58 42001.57 3482.45 19.20% numa02.sh Real: 60.17 62.47 60.91 0.85 0.607% numa02.sh Sys: 15.30 22.82 17.04 2.90 23.70% numa02.sh User: 5202.13 5255.51 5219.08 20.14 0.232% numa03.sh Real: 823.91 844.89 833.86 8.46 0.828% numa03.sh Sys: 130.69 148.29 140.47 6.21 -14.9% numa03.sh User: 62519.15 64262.20 63613.38 620.05 1.740% numa04.sh Real: 515.30 603.74 548.56 30.93 -12.3% numa04.sh Sys: 459.73 525.48 489.18 21.63 -40.5% numa04.sh User: 40561.96 44919.18 42047.87 1526.85 -11.8% numa05.sh Real: 396.58 454.37 421.13 19.71 -1.53% numa05.sh Sys: 208.72 422.02 348.90 73.60 -14.7% numa05.sh User: 33124.08 36109.35 34846.47 1089.74 -1.79% Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1529514181-9842-20-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9c9e54ea65d9..309c93fcc604 100644 
--- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2180,9 +2180,6 @@ static void task_numa_placement(struct task_struct *p) /* Set the new preferred node */ if (max_nid != p->numa_preferred_nid) sched_setnuma(p, max_nid); - - if (task_node(p) != p->numa_preferred_nid) - numa_migrate_preferred(p); } update_task_scan_period(p, fault_types[0], fault_types[1]); @@ -2385,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) numa_is_active_node(mem_node, ng)) local = 1; - task_numa_placement(p); - /* * Retry task to preferred node migration periodically, in case it * case it previously failed, or the scheduler moved us. */ - if (time_after(jiffies, p->numa_migrate_retry)) + if (time_after(jiffies, p->numa_migrate_retry)) { + task_numa_placement(p); numa_migrate_preferred(p); + } if (migrated) p->numa_pages_migrated += pages; -- cgit v1.2.3 From 7d63fb3af87aa67aa7d24466e792f9d7c57d8e79 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Jul 2018 16:22:22 -0700 Subject: swiotlb: clean up reporting This removes needless use of '%p', and refactors the printk calls to use pr_*() helpers instead. Signed-off-by: Kees Cook Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 904541055792..4f8a6dbf0b60 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -17,6 +17,8 @@ * 08/12/11 beckyb Add highmem support */ +#define pr_fmt(fmt) "software IO TLB: " fmt + #include #include #include @@ -162,20 +164,16 @@ static bool no_iotlb_memory; void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; if (no_iotlb_memory) { - pr_warn("software IO TLB: No low mem\n"); + pr_warn("No low mem\n"); return; } - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); - - printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", + pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", (unsigned long long)io_tlb_start, (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); + bytes >> 20); } /* @@ -275,7 +273,7 @@ swiotlb_init(int verbose) if (io_tlb_start) memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - pr_warn("Cannot allocate SWIOTLB buffer"); + pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -317,8 +315,8 @@ swiotlb_late_init_with_default_size(size_t default_size) return -ENOMEM; } if (order != get_order(bytes)) { - printk(KERN_WARNING "Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + pr_warn("only able to allocate %ld MB\n", + (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; } rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); -- cgit v1.2.3 From 6f4ceee9305dc3fe74099159b460f4b56b506f1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 24 Jul 2018 14:26:04 -0400 Subject: cpu/hotplug: Add a cpus_read_trylock() function There are use cases where it can be useful to have a cpus_read_trylock() function to work around circular lock dependency problem involving the cpu_hotplug_lock. Signed-off-by: Waiman Long Signed-off-by: Rafael J. 
Wysocki --- kernel/cpu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..307486baa477 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -290,6 +290,12 @@ void cpus_read_lock(void) } EXPORT_SYMBOL_GPL(cpus_read_lock); +int cpus_read_trylock(void) +{ + return percpu_down_read_trylock(&cpu_hotplug_lock); +} +EXPORT_SYMBOL_GPL(cpus_read_trylock); + void cpus_read_unlock(void) { percpu_up_read(&cpu_hotplug_lock); -- cgit v1.2.3 From 1fee4f77523a3e5a44fa33c9aa76de8c6c42a632 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Jul 2018 21:43:36 +0900 Subject: doc: tracing: Fix a typo of trace_stat The name of the directory for per-cpu function statistics is trace_stat, not trace_stats. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Jonathan Corbet --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2bd4a9181a0f..9a27f146fa1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -521,7 +521,7 @@ config FUNCTION_PROFILER in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A "functions" file is created in - the trace_stats directory; this file shows the list of functions that + the trace_stat directory; this file shows the list of functions that have been hit and their counters. If in doubt, say N. -- cgit v1.2.3 From f07d141fe9430cdf9f8a65a87c4136bd83b8ab2e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 23 Jul 2018 23:16:07 +0100 Subject: dma-mapping: Generalise dma_32bit_limit flag Whilst the notion of an upstream DMA restriction is most commonly seen in PCI host bridges saddled with a 32-bit native interface, a more general version of the same issue can exist on complex SoCs where a bus or point-to-point interconnect link from a device's DMA master interface to another component along the path to memory (often an IOMMU) may carry fewer address bits than the interfaces at both ends nominally support. In order to properly deal with this, the first step is to expand the dma_32bit_limit flag into an arbitrary mask. To minimise the impact on existing code, we'll make sure to only consider this new mask valid if set. That makes sense anyway, since a mask of zero would represent DMA not being wired up at all, and that would be better handled by not providing valid ops in the first place. Signed-off-by: Robin Murphy Acked-by: Ard Biesheuvel Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8be8106270c2..c2860c5a9e96 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -180,10 +180,10 @@ int dma_direct_supported(struct device *dev, u64 mask) return 0; #endif /* - * Various PCI/PCIe bridges have broken support for > 32bit DMA even - * if the device itself might support it. + * Upstream PCI/PCIe bridges or SoC interconnects may not carry + * as many DMA address bits as the device itself supports. 
*/ - if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) + if (dev->bus_dma_mask && mask > dev->bus_dma_mask) return 0; return 1; } -- cgit v1.2.3 From 684ad537abff987886d63fb3c573eeca40d7f2db Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 25 Jul 2018 16:00:18 -0400 Subject: timekeeping: Prevent false warning when persistent clock is not available On arches with no persistent clock a message like this is printed during boot: [ 0.000000] Persistent clock returned invalid value The value is not invalid: Zero means that no persistent clock is available and the absence of persistent clock should be quietly accepted. Fixes: 3eca993740b8 ("timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset()") Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: sboyd@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/20180725200018.23722-1-pasha.tatashin@oracle.com --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 30d7f64ffc87..6183e7460138 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1536,7 +1536,7 @@ void __init timekeeping_init(void) if (timespec64_valid_strict(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; - } else { + } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } -- cgit v1.2.3 From bd9f943e5d2a42d864f9692477a25034c9d47dcc Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 30 Jul 2018 09:52:52 -0400 Subject: sched/clock: Disable interrupts when calling generic_sched_clock_init() sched_clock_init() used be called early during boot when interrupts were still disabled. After the recent changes to utilize sched clock early the sched_clock_init() call happens when interrupts are already enabled, which triggers the following warning: WARNING: CPU: 0 PID: 0 at kernel/time/sched_clock.c:180 sched_clock_register+0x44/0x278 [] (warn_slowpath_null) from [] (sched_clock_register+0x44/0x278) [] (sched_clock_register) from [] (generic_sched_clock_init+0x28/0x88) [] (generic_sched_clock_init) from [] (sched_clock_init+0x54/0x74) [] (sched_clock_init) from [] (start_kernel+0x310/0x3e4) [] (start_kernel) from [<00000000>] ( (null)) Disable IRQs for the duration of generic_sched_clock_init(). 
Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Signed-off-by: Pavel Tatashin Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Link: https://lkml.kernel.org/r/20180730135252.24599-1-pasha.tatashin@oracle.com --- kernel/sched/clock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 811a39aca1ce..e3e3b979f9bd 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -452,7 +452,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); void __init sched_clock_init(void) { static_branch_inc(&sched_clock_running); + local_irq_disable(); generic_sched_clock_init(); + local_irq_enable(); } u64 sched_clock_cpu(int cpu) -- cgit v1.2.3 From d018031f562b9c2eff038969ab1955a370c52d8f Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 24 Jul 2018 20:17:48 +0530 Subject: cpu/hotplug: Clarify CPU hotplug step name for timers After commit 249d4a9b3246 ("timers: Reinitialize per cpu bases on hotplug") i.e. the introduction of state CPUHP_TIMERS_PREPARE instead of CPUHP_TIMERS_DEAD the step name "timers:dead" is not longer accurate. Rename it to "timers:prepare". [ tglx: Massaged changelog ] Signed-off-by: Mukesh Ojha Signed-off-by: Thomas Gleixner Cc: gkohli@codeaurora.org Cc: neeraju@codeaurora.org Cc: Peter Zijlstra Cc: Lai Jiangshan Cc: Brendan Jackman Cc: Mathieu Malaterre Link: https://lkml.kernel.org/r/1532443668-26810-1-git-send-email-mojha@codeaurora.org --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 0db8938fbb23..8e6606ac3d72 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1274,7 +1274,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_PREPARE] = { - .name = "timers:dead", + .name = "timers:prepare", .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, -- cgit v1.2.3 From 9d2dcc8fc66087d7fd365e07cd4292adc873e568 Mon Sep 17 00:00:00 2001 From: Michael O'Farrell Date: Mon, 30 Jul 2018 13:14:34 -0700 Subject: arm64: perf: Add cap_user_time aarch64 It is useful to get the running time of a thread. Doing so in an efficient manner can be important for performance of user applications. Avoiding system calls in `clock_gettime` when handling CLOCK_THREAD_CPUTIME_ID is important. Other clocks are handled in the VDSO, but CLOCK_THREAD_CPUTIME_ID falls back on the system call. CLOCK_THREAD_CPUTIME_ID is not handled in the VDSO since it would have costs associated with maintaining updated user space accessible time offsets. These offsets have to be updated everytime the a thread is scheduled/descheduled. However, for programs regularly checking the running time of a thread, this is a performance improvement. This patch takes a middle ground, and adds support for cap_user_time an optional feature of the perf_event API. This way costs are only incurred when the perf_event api is enabled. This is done the same way as it is in x86. Ultimately this allows calculating the thread running time in userspace on aarch64 as follows (adapted from perf_event_open manpage): u32 seq, time_mult, time_shift; u64 running, count, time_offset, quot, rem, delta; struct perf_event_mmap_page *pc; pc = buf; // buf is the perf event mmaped page as documented in the API. 
if (pc->cap_usr_time) { do { seq = pc->lock; barrier(); running = pc->time_running; count = readCNTVCT_EL0(); // Read ARM hardware clock. time_offset = pc->time_offset; time_mult = pc->time_mult; time_shift = pc->time_shift; barrier(); } while (pc->lock != seq); quot = (count >> time_shift); rem = count & (((u64)1 << time_shift) - 1); delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); running += delta; // running now has the current nanosecond level thread time. } Summary of changes in the patch: For aarch64 systems, make arch_perf_update_userpage update the timing information stored in the perf_event page. Requiring the following calculations: - Calculate the appropriate time_mult, and time_shift factors to convert ticks to nano seconds for the current clock frequency. - Adjust the mult and shift factors to avoid shift factors of 32 bits. (possibly unnecessary) - The time_offset userspace should apply when doing calculations: negative the current sched time (now), because time_running and time_enabled fields of the perf_event page have just been updated. Toggle bits to appropriate values: - Enable cap_user_time Signed-off-by: Michael O'Farrell Signed-off-by: Will Deacon --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..1a16dcca0da2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5246,8 +5246,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg = rb->user_page; /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. */ preempt_disable(); ++userpg->lock; -- cgit v1.2.3 From fbeb1603bf4e9baa82da8f794de42949d0fe5e25 Mon Sep 17 00:00:00 2001 From: Arthur Fabre Date: Tue, 31 Jul 2018 18:17:22 +0100 Subject: bpf: verifier: MOV64 don't mark dst reg unbounded When check_alu_op() handles a BPF_MOV64 between two registers, it calls check_reg_arg(DST_OP) on the dst register, marking it as unbounded. If the src and dst register are the same, this marks the src as unbounded, which can lead to unexpected errors for further checks that rely on bounds info. For example: BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_2), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), Results in: "math between ctx pointer and register with unbounded min value is not allowed" check_alu_op() now uses check_reg_arg(DST_OP_NO_MARK), and MOVs that need to mark the dst register (MOVIMM, MOV32) do so. Added a test case for MOV64 dst == src, and dst != src. 
Signed-off-by: Arthur Fabre Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25e47c195874..e948303a0ea8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3238,8 +3238,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + /* check dest operand, mark as required later */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3265,6 +3265,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R = imm * remember the value we stored into this reg */ + /* clear any state __mark_reg_known doesn't set */ + mark_reg_unknown(env, regs, insn->dst_reg); regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, -- cgit v1.2.3 From 2c323017e381c55c5ce2a603b8305bb18c1162cc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 Jul 2018 12:39:04 -0400 Subject: blk-cgroup: clear the throttle queue on fork We were hitting a panic in production where we put too many times on the request queue. This is because we'd get the throttle_queue of the parent if we fork()'ed while we needed to be throttled, but we didn't have a reference on it. Instead just clear these flags on fork so the child doesn't pay for the sins of its father. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..694ae0e56866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -843,6 +843,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->fail_nth = 0; #endif +#ifdef CONFIG_BLK_CGROUP + tsk->throttle_queue = NULL; + tsk->use_memdelay = 0; +#endif + return tsk; free_stack: -- cgit v1.2.3 From 87a4c375995ed8eaa721b08825cf73d0b02b3145 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jul 2018 13:39:32 +0200 Subject: kconfig: include kernel/Kconfig.preempt from init/Kconfig Almost all architectures include it. Add a ARCH_NO_PREEMPT symbol to disable preempt support for alpha, hexagon, non-coldfire m68k and user mode Linux. Signed-off-by: Christoph Hellwig Signed-off-by: Masahiro Yamada --- kernel/Kconfig.preempt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 3f9c97419f02..cd1655122ec0 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -18,6 +18,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_NO_PREEMPT help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -35,6 +36,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + depends on !ARCH_NO_PREEMPT select PREEMPT_COUNT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help -- cgit v1.2.3 From 363e934d8811d799c88faffc5bfca782fd728334 Mon Sep 17 00:00:00 2001 From: Gaurav Kohli Date: Thu, 2 Aug 2018 14:21:03 +0530 Subject: timers: Clear timer_base::must_forward_clk with timer_base::lock held timer_base::must_forward_clock is indicating that the base clock might be stale due to a long idle sleep. 
The forwarding of the base clock takes place in the timer softirq or when a timer is enqueued to a base which is idle. If the enqueue of timer to an idle base happens from a remote CPU, then the following race can happen: CPU0 CPU1 run_timer_softirq mod_timer base = lock_timer_base(timer); base->must_forward_clk = false if (base->must_forward_clk) forward(base); -> skipped enqueue_timer(base, timer, idx); -> idx is calculated high due to stale base unlock_timer_base(timer); base = lock_timer_base(timer); forward(base); The root cause is that timer_base::must_forward_clk is cleared outside the timer_base::lock held region, so the remote queuing CPU observes it as cleared, but the base clock is still stale. This can cause large granularity values for timers, i.e. the accuracy of the expiry time suffers. Prevent this by clearing the flag with timer_base::lock held, so that the forwarding takes place before the cleared flag is observable by a remote CPU. Signed-off-by: Gaurav Kohli Signed-off-by: Thomas Gleixner Cc: john.stultz@linaro.org Cc: sboyd@kernel.org Cc: linux-arm-msm@vger.kernel.org Link: https://lkml.kernel.org/r/1533199863-22748-1-git-send-email-gkohli@codeaurora.org --- kernel/time/timer.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index baa528a24a73..fa49cd753dea 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1657,6 +1657,22 @@ static inline void __run_timers(struct timer_base *base) raw_spin_lock_irq(&base->lock); + /* + * timer_base::must_forward_clk must be cleared before running + * timers so that any timer functions that call mod_timer() will + * not try to forward the base. Idle tracking / clock forwarding + * logic is only used with BASE_STD timers. + * + * The must_forward_clk flag is cleared unconditionally also for + * the deferrable base. The deferrable base is not affected by idle + * tracking and never forwarded, so clearing the flag is a NOOP. + * + * The fact that the deferrable base is never forwarded can cause + * large variations in granularity for deferrable timers, but they + * can be deferred for long periods due to idle anyway. + */ + base->must_forward_clk = false; + while (time_after_eq(jiffies, base->clk)) { levels = collect_expired_timers(base, heads); @@ -1676,19 +1692,6 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - /* - * must_forward_clk must be cleared before running timers so that any - * timer functions that call mod_timer will not try to forward the - * base. idle trcking / clock forwarding logic is only used with - * BASE_STD timers. - * - * The deferrable base does not do idle tracking at all, so we do - * not forward it. This can result in very large variations in - * granularity for deferrable timers, but they can be deferred for - * long periods due to idle. - */ - base->must_forward_clk = false; - __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); -- cgit v1.2.3 From 234b3840d73430564a03f53973a311b7a83a95a9 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 11 Jul 2018 12:24:23 +0100 Subject: tick/broadcast-hrtimer: Use cpu_possible_mask for ce_broadcast_hrtimer This is the last instance of cpu_all_mask usage in the core framework. Replace it with cpu_possible_mask like all other instances in the clockevent drivers. 
This makes it possible to add a warning in the core clockevents_register_device on usage of cpu_all_mask from any clockevent drivers in the future. Signed-off-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/1531308264-24220-2-git-send-email-sudeep.holla@arm.com --- kernel/time/tick-broadcast-hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 58045eb976c3..a59641fb88b6 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -90,7 +90,7 @@ static struct clock_event_device ce_broadcast_hrtimer = { .max_delta_ticks = ULONG_MAX, .mult = 1, .shift = 0, - .cpumask = cpu_all_mask, + .cpumask = cpu_possible_mask, }; static enum hrtimer_restart bc_handler(struct hrtimer *t) -- cgit v1.2.3 From fbfa9260085b5b578a049a90135e5c51928c5f7f Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 11 Jul 2018 12:24:24 +0100 Subject: clockevents: Warn if cpu_all_mask is used as cpumask Using cpu_all_mask in clockevents cpumask may result in issues while comparing multiple clockevent devices to choose the preferred one. On one of the platforms with 2 system (i.e. non per-CPU) timers with different ratings, having cpu_all_mask for one of the device resulted in a boot hang due to a endless loop in clockevents_notify_released() as both were clocksources were selected as preferred. In order to prevent such issues in the future, warn if any clockevent driver sets cpu_all_mask as it's cpumask and just override it to use cpu_possible_mask. All the existing occurrences of cpu_all_mask are already replaced with cpu_possible_mask. Signed-off-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/1531308264-24220-3-git-send-email-sudeep.holla@arm.com --- kernel/time/clockevents.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 16c027e9cc73..8c0e4092f661 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -463,6 +463,12 @@ void clockevents_register_device(struct clock_event_device *dev) dev->cpumask = cpumask_of(smp_processor_id()); } + if (dev->cpumask == cpu_all_mask) { + WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", + dev->name); + dev->cpumask = cpu_possible_mask; + } + raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); -- cgit v1.2.3 From b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Jul 2018 13:21:40 +0200 Subject: stop_machine: Reflow cpu_stop_queue_two_works() The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by lifting the preempt_disable() to the top to create more natural nesting wrt the spinlocks and make the wake_up_q() and preempt_enable() unconditional at the end. Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait with preemption enabled. 
Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: isaacm@codeaurora.org Cc: matt@codeblueprint.co.uk Cc: psodagud@codeaurora.org Cc: gregkh@linuxfoundation.org Cc: pkondeti@codeaurora.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180730112140.GH2494@hirez.programming.kicks-ass.net --- kernel/stop_machine.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..34b6652e8677 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* + * The waking up of stopper threads has to happen in the same + * scheduling context as the queueing. Otherwise, there is a + * possibility of one of the above stoppers being woken up by another + * CPU, and preempting us. This will cause us to not wake up the other + * stopper forever. + */ + preempt_disable(); raw_spin_lock_irq(&stopper1->lock); raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,36 +264,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); - /* - * The waking up of stopper threads has to happen - * in the same scheduling context as the queueing. - * Otherwise, there is a possibility of one of the - * above stoppers being woken up by another CPU, - * and preempting us. This will cause us to n ot - * wake up the other stopper forever. - */ - preempt_disable(); + unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(&wakeq); - preempt_enable(); - } + wake_up_q(&wakeq); + preempt_enable(); return err; } -- cgit v1.2.3 From 0a4c58f5702858822621fa1177c7d3475f181ccb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:17 -0700 Subject: bpf: add ability to charge bpf maps memory dynamically This commits extends existing bpf maps memory charging API to support dynamic charging/uncharging. This is required to account memory used by maps, if all entries are created dynamically after the map initialization. 
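For illustration only (not part of this patch): a map implementation that creates entries at run time could account for them with the new helpers roughly as below. example_alloc_elem() and its sizing are made up for the sketch; the real calling convention is whatever the individual map type needs.

	static void *example_alloc_elem(struct bpf_map *map, u32 size)
	{
		u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
		void *elem;
		int ret;

		/* charge first, so the RLIMIT_MEMLOCK check happens before the allocation */
		ret = bpf_map_charge_memlock(map, pages);
		if (ret)
			return ERR_PTR(ret);

		elem = kzalloc(size, GFP_USER);
		if (!elem) {
			bpf_map_uncharge_memlock(map, pages);
			return ERR_PTR(-ENOMEM);
		}
		return elem;
	}

The cgroup local storage map added later in this series follows this pattern in bpf_cgroup_storage_alloc()/bpf_cgroup_storage_free().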
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a31a1ba0f8ea..7958252a4d29 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,32 +181,60 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_map_charge_memlock(struct bpf_map *map) +static int bpf_charge_memlock(struct user_struct *user, u32 pages) { - struct user_struct *user = get_current_user(); - unsigned long memlock_limit; + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + return 0; +} - atomic_long_add(map->pages, &user->locked_vm); +static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +{ + atomic_long_sub(pages, &user->locked_vm); +} + +static int bpf_map_init_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + int ret; - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(map->pages, &user->locked_vm); + ret = bpf_charge_memlock(user, map->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } map->user = user; - return 0; + return ret; } -static void bpf_map_uncharge_memlock(struct bpf_map *map) +static void bpf_map_release_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - - atomic_long_sub(map->pages, &user->locked_vm); + bpf_uncharge_memlock(user, map->pages); free_uid(user); } +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +{ + int ret; + + ret = bpf_charge_memlock(map->user, pages); + if (ret) + return ret; + map->pages += pages; + return ret; +} + +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +{ + bpf_uncharge_memlock(map->user, pages); + map->pages -= pages; +} + static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -256,7 +284,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -492,7 +520,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_charge_memlock(map); + err = bpf_map_init_memlock(map); if (err) goto free_map_sec; @@ -515,7 +543,7 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: -- cgit v1.2.3 From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:18 -0700 Subject: bpf: introduce cgroup storage maps This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps: a special type of maps which are implementing the cgroup storage. >From the userspace point of view it's almost a generic hash map with the (cgroup inode id, attachment type) pair used as a key. The only difference is that some operations are restricted: 1) a user can't create new entries, 2) a user can't remove existing entries. The lookup from userspace is o(log(n)). 
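For illustration (not part of the patch), a userspace lookup could look roughly like this, assuming a __u64 value, a map fd obtained beforehand, and a cgroup id taken e.g. from name_to_handle_at() on the cgroup directory; bpf_map_lookup_elem() is the libbpf wrapper around the BPF_MAP_LOOKUP_ELEM command:

	struct bpf_cgroup_storage_key key = {
		.cgroup_inode_id = cgroup_id,			/* id of the target cgroup */
		.attach_type	 = BPF_CGROUP_INET_EGRESS,	/* attach type of the program */
	};
	__u64 value;

	if (bpf_map_lookup_elem(map_fd, &key, &value))
		/* no entry: no program with this storage is attached there */
		perror("bpf_map_lookup_elem");

Updates of existing entries work the same way from userspace; only creation and deletion of entries are rejected.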
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/Makefile | 1 + kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 12 ++ 4 files changed, 392 insertions(+) create mode 100644 kernel/bpf/local_storage.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f27f5496d6fe..e8906cbad81f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,6 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c new file mode 100644 index 000000000000..f23d3fdeba23 --- /dev/null +++ b/kernel/bpf/local_storage.c @@ -0,0 +1,376 @@ +//SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CGROUP_BPF + +#define LOCAL_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +struct bpf_cgroup_storage_map { + struct bpf_map map; + + spinlock_t lock; + struct bpf_prog *prog; + struct rb_root root; + struct list_head list; +}; + +static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +{ + return container_of(map, struct bpf_cgroup_storage_map, map); +} + +static int bpf_cgroup_storage_key_cmp( + const struct bpf_cgroup_storage_key *key1, + const struct bpf_cgroup_storage_key *key2) +{ + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + return 0; +} + +static struct bpf_cgroup_storage *cgroup_storage_lookup( + struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, + bool locked) +{ + struct rb_root *root = &map->root; + struct rb_node *node; + + if (!locked) + spin_lock_bh(&map->lock); + + node = root->rb_node; + while (node) { + struct bpf_cgroup_storage *storage; + + storage = container_of(node, struct bpf_cgroup_storage, node); + + switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + case -1: + node = node->rb_left; + break; + case 1: + node = node->rb_right; + break; + default: + if (!locked) + spin_unlock_bh(&map->lock); + return storage; + } + } + + if (!locked) + spin_unlock_bh(&map->lock); + + return NULL; +} + +static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, + struct bpf_cgroup_storage *storage) +{ + struct rb_root *root = &map->root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct bpf_cgroup_storage *this; + + this = container_of(*new, struct bpf_cgroup_storage, node); + + parent = *new; + switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + case -1: + new = &((*new)->rb_left); + break; + case 1: + new = &((*new)->rb_right); + break; + default: + return -EEXIST; + } + } + + rb_link_node(&storage->node, parent, new); + rb_insert_color(&storage->node, root); + + return 0; +} + +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct 
bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + + storage = cgroup_storage_lookup(map, key, false); + if (!storage) + return NULL; + + return &READ_ONCE(storage->buf)->data[0]; +} + +static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, + void *value, u64 flags) +{ + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + struct bpf_storage_buffer *new; + + if (flags & BPF_NOEXIST) + return -EINVAL; + + storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, + key, false); + if (!storage) + return -ENOENT; + + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!new) + return -ENOMEM; + + memcpy(&new->data[0], value, map->value_size); + + new = xchg(&storage->buf, new); + kfree_rcu(new, rcu); + + return 0; +} + +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, + void *_next_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage_key *next = _next_key; + struct bpf_cgroup_storage *storage; + + spin_lock_bh(&map->lock); + + if (list_empty(&map->list)) + goto enoent; + + if (key) { + storage = cgroup_storage_lookup(map, key, true); + if (!storage) + goto enoent; + + storage = list_next_entry(storage, list); + if (!storage) + goto enoent; + } else { + storage = list_first_entry(&map->list, + struct bpf_cgroup_storage, list); + } + + spin_unlock_bh(&map->lock); + next->attach_type = storage->key.attach_type; + next->cgroup_inode_id = storage->key.cgroup_inode_id; + return 0; + +enoent: + spin_unlock_bh(&map->lock); + return -ENOENT; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_cgroup_storage_map *map; + + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + return ERR_PTR(-EINVAL); + + if (attr->value_size > PAGE_SIZE) + return ERR_PTR(-E2BIG); + + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + + if (attr->max_entries) + /* max_entries is not used and enforced to be 0 */ + return ERR_PTR(-EINVAL); + + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), + __GFP_ZERO | GFP_USER, numa_node); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), + PAGE_SIZE) >> PAGE_SHIFT; + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&map->map, attr); + + spin_lock_init(&map->lock); + map->root = RB_ROOT; + INIT_LIST_HEAD(&map->list); + + return &map->map; +} + +static void cgroup_storage_map_free(struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + WARN_ON(!RB_EMPTY_ROOT(&map->root)); + WARN_ON(!list_empty(&map->list)); + + kfree(map); +} + +static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +const struct bpf_map_ops cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, +}; + +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + int ret = -EBUSY; + + spin_lock_bh(&map->lock); + + if 
(map->prog && map->prog != prog) + goto unlock; + if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + goto unlock; + + map->prog = prog; + prog->aux->cgroup_storage = _map; + ret = 0; +unlock: + spin_unlock_bh(&map->lock); + + return ret; +} + +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + spin_lock_bh(&map->lock); + if (map->prog == prog) { + WARN_ON(prog->aux->cgroup_storage != _map); + map->prog = NULL; + prog->aux->cgroup_storage = NULL; + } + spin_unlock_bh(&map->lock); +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +{ + struct bpf_cgroup_storage *storage; + struct bpf_map *map; + u32 pages; + + map = prog->aux->cgroup_storage; + if (!map) + return NULL; + + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + if (bpf_map_charge_memlock(map, pages)) + return ERR_PTR(-EPERM); + + storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), + __GFP_ZERO | GFP_USER, map->numa_node); + if (!storage) { + bpf_map_uncharge_memlock(map, pages); + return ERR_PTR(-ENOMEM); + } + + storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!storage->buf) { + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); + } + + storage->map = (struct bpf_cgroup_storage_map *)map; + + return storage; +} + +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +{ + u32 pages; + struct bpf_map *map; + + if (!storage) + return; + + map = &storage->map->map; + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_uncharge_memlock(map, pages); + + kfree_rcu(storage->buf, rcu); + kfree_rcu(storage, rcu); +} + +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type) +{ + struct bpf_cgroup_storage_map *map; + + if (!storage) + return; + + storage->key.attach_type = type; + storage->key.cgroup_inode_id = cgroup->kn->id.id; + + map = storage->map; + + spin_lock_bh(&map->lock); + WARN_ON(cgroup_storage_insert(map, storage)); + list_add(&storage->list, &map->list); + spin_unlock_bh(&map->lock); +} + +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +{ + struct bpf_cgroup_storage_map *map; + struct rb_root *root; + + if (!storage) + return; + + map = storage->map; + + spin_lock_bh(&map->lock); + root = &map->root; + rb_erase(&storage->node, root); + + list_del(&storage->list); + spin_unlock_bh(&map->lock); +} + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7958252a4d29..5af4e9e2722d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux) { int i; + if (aux->cgroup_storage) + bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e948303a0ea8..7e75434a9e54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + bpf_cgroup_storage_assign(env->prog, map)) { + verbose(env, + 
"only one cgroup storage is allowed\n"); + fdput(f); + return -EBUSY; + } + fdput(f); next_insn: insn++; @@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env) { int i; + if (env->prog->aux->cgroup_storage) + bpf_cgroup_storage_release(env->prog, + env->prog->aux->cgroup_storage); + for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } -- cgit v1.2.3 From aa0ad5b0391e268bdecf6cda31268388844f8afd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:19 -0700 Subject: bpf: pass a pointer to a cgroup storage using pcpu variable This commit introduces the bpf_cgroup_storage_set() helper, which will be used to pass a pointer to a cgroup storage to the bpf helper. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index f23d3fdeba23..fc4e37f68f2a 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,6 +7,8 @@ #include #include +DEFINE_PER_CPU(void*, bpf_cgroup_storage); + #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ -- cgit v1.2.3 From d7bf2c10af053191e931b58704cd862fccb7f9de Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:20 -0700 Subject: bpf: allocate cgroup storage entries on attaching bpf programs If a bpf program is using cgroup local storage, allocate a bpf_cgroup_storage structure automatically on attaching the program to a cgroup and save the pointer into the corresponding bpf_prog_list entry. Analogically, release the cgroup local storage on detaching of the bpf program. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index badabb0b435c..935274c86bfe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -188,6 +190,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; + struct bpf_cgroup_storage *storage, *old_storage = NULL; struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; @@ -210,31 +213,47 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return -ENOMEM; + if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) - if (pl->prog == prog) + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) { /* disallow attaching the same prog twice */ + bpf_cgroup_storage_free(storage); return -EINVAL; + } + } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } + pl_was_allocated = true; pl->prog = prog; + pl->storage = storage; list_add_tail(&pl->node, progs); } else { if 
(list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; + old_storage = pl->storage; + bpf_cgroup_storage_unlink(old_storage); pl_was_allocated = false; } pl->prog = prog; + pl->storage = storage; } cgrp->bpf.flags[type] = flags; @@ -257,10 +276,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } static_branch_inc(&cgroup_bpf_enabled_key); + if (old_storage) + bpf_cgroup_storage_free(old_storage); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_cgroup_storage_link(storage, cgrp, type); return 0; cleanup: @@ -276,6 +298,9 @@ cleanup: /* and cleanup the prog list */ pl->prog = old_prog; + bpf_cgroup_storage_free(pl->storage); + pl->storage = old_storage; + bpf_cgroup_storage_link(old_storage, cgrp, type); if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -356,6 +381,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ -- cgit v1.2.3 From 394e40a29788820c9c0526b1c3497c9e0ec2a126 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:21 -0700 Subject: bpf: extend bpf_prog_array to store pointers to the cgroup storage This patch converts bpf_prog_array from an array of prog pointers to the array of struct bpf_prog_array_item elements. This allows to save a cgroup storage pointer for each bpf program efficiently attached to a cgroup. 
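The corresponding header change lives outside of kernel/ and is therefore not shown below; as a rough sketch, the array elements and the iteration pattern used in the hunks look like this:

	struct bpf_prog_array_item {
		struct bpf_prog *prog;
		struct bpf_cgroup_storage *cgroup_storage;
	};

	struct bpf_prog_array_item *item;

	/* arrays stay NULL-terminated, but the walk is over items now */
	for (item = array->items; item->prog; item++) {
		if (item->prog == &dummy_bpf_prog.prog)
			continue;
		/* item->cgroup_storage is available next to the program */
	}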
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 21 ++++++++------- kernel/bpf/core.c | 76 +++++++++++++++++++++++++++-------------------------- 2 files changed, 51 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 935274c86bfe..ddfa6cc13e57 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -117,15 +117,18 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - progs->progs[cnt++] = pl->prog; - } - p = cgroup_parent(p); - } while (p); + if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + list_for_each_entry(pl, &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + + progs->items[cnt].prog = pl->prog; + progs->items[cnt].cgroup_storage = pl->storage; + cnt++; + } + } while ((p = cgroup_parent(p))); rcu_assign_pointer(*array, progs); return 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 253aa8e79c7b..9abcf25ebf9f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1542,7 +1542,8 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog *) * (prog_cnt + 1), + sizeof(struct bpf_prog_array_item) * + (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1556,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +int bpf_prog_array_length(struct bpf_prog_array __rcu *array) { - struct bpf_prog **prog; + struct bpf_prog_array_item *item; u32 cnt = 0; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) - if (*prog != &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) cnt++; rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + +static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt) { + struct bpf_prog_array_item *item; int i = 0; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) continue; - prog_ids[i] = (*prog)->aux->id; + prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { - prog++; + item++; break; } } - return !!(*prog); + return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, __u32 __user *prog_ids, u32 cnt) { - struct bpf_prog **prog; unsigned long err = 0; bool nospc; u32 *ids; @@ -1611,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, if (!ids) return -ENOMEM; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - nospc = bpf_prog_array_copy_core(prog, ids, cnt); + nospc = bpf_prog_array_copy_core(array, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1623,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void 
bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, struct bpf_prog *old_prog) { - struct bpf_prog **prog = progs->progs; + struct bpf_prog_array_item *item = array->items; - for (; *prog; prog++) - if (*prog == old_prog) { - WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + for (; item->prog; item++) + if (item->prog == old_prog) { + WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } @@ -1641,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog **existing_prog; + struct bpf_prog_array_item *existing; struct bpf_prog_array *array; bool found_exclude = false; int new_prog_idx = 0; @@ -1650,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, * the new array. */ if (old_array) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) { - if (*existing_prog == exclude_prog) { + existing = old_array->items; + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog) { found_exclude = true; continue; } - if (*existing_prog != &dummy_bpf_prog.prog) + if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (*existing_prog == include_prog) + if (existing->prog == include_prog) return -EEXIST; } } @@ -1684,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) - array->progs[new_prog_idx++] = *existing_prog; + existing = old_array->items; + for (; existing->prog; existing++) + if (existing->prog != exclude_prog && + existing->prog != &dummy_bpf_prog.prog) { + array->items[new_prog_idx++].prog = + existing->prog; + } } if (include_prog) - array->progs[new_prog_idx++] = include_prog; - array->progs[new_prog_idx] = NULL; + array->items[new_prog_idx++].prog = include_prog; + array->items[new_prog_idx].prog = NULL; *new_array = array; return 0; } @@ -1701,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { - struct bpf_prog **prog; u32 cnt = 0; if (array) @@ -1714,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - prog = rcu_dereference_check(array, 1)->progs; - return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } -- cgit v1.2.3 From 3e6a4b3e0289dc9540a2c1d8a20657f4707fbabb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:22 -0700 Subject: bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way that the access from the bpf program side is lookup-free. That means the result is guaranteed to be a valid pointer to the cgroup storage; no NULL-check is required. This patch introduces BPF_PTR_TO_MAP_VALUE return type, which is required to cause the verifier accept programs, which are not checking the map value pointer for being NULL. 
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7e75434a9e54..1ede16c8bb40 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2545,8 +2545,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || + fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + else + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; -- cgit v1.2.3 From 7b5dd2bde72cd33313b63cf3ba1de6a9e443a65d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:23 -0700 Subject: bpf: don't allow create maps of cgroup local storages As there is one-to-one relation between a bpf program and cgroup local storage map, there is no sense in creating a map of cgroup local storage maps. Forbid it explicitly to avoid possible side effects. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/map_in_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da574612bea..3bfbf4464416 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -23,7 +23,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * is a runtime binding. Doing static check alone * in the verifier is not enough. */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } -- cgit v1.2.3 From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. Further it can be extended to support other types of local storage: e.g. thread local storage etc. 
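For illustration (not part of the patch), a device cgroup program could use the helper roughly as follows. The SEC() names, struct bpf_map_def and the helper declaration are assumed to come from the usual samples/selftests bpf_helpers.h conventions:

	struct bpf_map_def SEC("maps") dev_access_count = {
		.type		= BPF_MAP_TYPE_CGROUP_STORAGE,
		.key_size	= sizeof(struct bpf_cgroup_storage_key),
		.value_size	= sizeof(__u64),
		/* max_entries stays 0, it is not used for this map type */
	};

	SEC("cgroup/dev")
	int count_device_access(struct bpf_cgroup_dev_ctx *ctx)
	{
		/* flags must be 0; the returned pointer is valid, no NULL check needed */
		__u64 *counter = bpf_get_local_storage(&dev_access_count, 0);

		__sync_fetch_and_add(counter, 1);
		return 1;	/* allow the access */
	}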
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 20 ++++++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++++ 4 files changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddfa6cc13e57..0a4fe5a7dc91 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9abcf25ebf9f..4d09e610777f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73065e2d23c2..1991466b8327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ede16c8bb40..587468a9c37d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; default: break; } @@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. 
+ */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); -- cgit v1.2.3 From 4f7799d96e6621ce584df60739e1480a6fd89f0a Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 22 Jun 2018 10:01:26 -0700 Subject: genirq/irqchip: Remove MULTI_IRQ_HANDLER as it's now obselete Now that every user of MULTI_IRQ_HANDLER has been convereted over to use GENERIC_IRQ_MULTI_HANDLER remove the references to MULTI_IRQ_HANDLER. Signed-off-by: Palmer Dabbelt Signed-off-by: Thomas Gleixner Cc: linux@armlinux.org.uk Cc: catalin.marinas@arm.com Cc: Will Deacon Cc: jonas@southpole.se Cc: stefan.kristiansson@saunalahti.fi Cc: shorne@gmail.com Cc: jason@lakedaemon.net Cc: marc.zyngier@arm.com Cc: Arnd Bergmann Cc: nicolas.pitre@linaro.org Cc: vladimir.murzin@arm.com Cc: keescook@chromium.org Cc: jinb.park7@gmail.com Cc: yamada.masahiro@socionext.com Cc: alexandre.belloni@bootlin.com Cc: pombredanne@nexb.com Cc: Greg KH Cc: kstewart@linuxfoundation.org Cc: jhogan@kernel.org Cc: mark.rutland@arm.com Cc: ard.biesheuvel@linaro.org Cc: james.morse@arm.com Cc: linux-arm-kernel@lists.infradead.org Cc: openrisc@lists.librecores.org Link: https://lkml.kernel.org/r/20180622170126.6308-6-palmer@sifive.com --- kernel/irq/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index c6766f326072..5f3e2baefca9 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -134,7 +134,6 @@ config GENERIC_IRQ_DEBUGFS endmenu config GENERIC_IRQ_MULTI_HANDLER - depends on !MULTI_IRQ_HANDLER bool help Allow to specify the low level IRQ handler at run time. -- cgit v1.2.3 From 1b6266ebe3da8198e9a02fbad77bbb56e2f7ce2e Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Thu, 2 Aug 2018 23:09:41 -0700 Subject: watchdog: Reduce message verbosity Code is emitting the following error message during boot on systems without PMU hardware support while probing NMI capability. NMI watchdog: Perf event create on CPU 0 failed with -2 This error is emitted as the perf subsystem returns -ENOENT due to lack of PMUs in the system. It is followed by the warning that NMI watchdog is disabled: NMI watchdog: Perf NMI watchdog permanently disabled While NMI disabled information is useful for ordinary users, seeing a PERF event create failed with error code -2 is not. Reduce the message severity to debug so that if debugging is still possible in case the error code returned by perf is required for analysis. 
Signed-off-by: Sinan Kaya Signed-off-by: Thomas Gleixner Acked-by: Don Zickus Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Colin Ian King Cc: Peter Zijlstra Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=599368 Link: https://lkml.kernel.org/r/20180803060943.2643-1-okaya@kernel.org --- kernel/watchdog_hld.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index e449a23e9d59..1f7020d65d0a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void) evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (IS_ERR(evt)) { - pr_info("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return PTR_ERR(evt); } this_cpu_write(watchdog_ev, evt); -- cgit v1.2.3 From cfd355145c32bb7ccb65fccbe2d67280dc2119e1 Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Fri, 3 Aug 2018 13:56:06 -0700 Subject: stop_machine: Atomically queue and wake stopper threads When cpu_stop_queue_work() releases the lock for the stopper thread that was queued into its wake queue, preemption is enabled, which leads to the following deadlock: CPU0 CPU1 sched_setaffinity(0, ...) __set_cpus_allowed_ptr() stop_one_cpu(0, ...) stop_two_cpus(0, 1, ...) cpu_stop_queue_work(0, ...) cpu_stop_queue_two_works(0, ..., 1, ...) -grabs lock for migration/0- -spins with preemption disabled, waiting for migration/0's lock to be released- -adds work items for migration/0 and queues migration/0 to its wake_q- -releases lock for migration/0 and preemption is enabled- -current thread is preempted, and __set_cpus_allowed_ptr has changed the thread's cpu allowed mask to CPU1 only- -acquires migration/0 and migration/1's locks- -adds work for migration/0 but does not add migration/0 to wake_q, since it is already in a wake_q- -adds work for migration/1 and adds migration/1 to its wake_q- -releases migration/0 and migration/1's locks, wakes migration/1, and enables preemption- -since migration/1 is requested to run, migration/1 begins to run and waits on migration/0, but migration/0 will never be able to run, since the thread that can wake it is affine to CPU1- Disable preemption in cpu_stop_queue_work() before queueing works for stopper threads, and queueing the stopper thread in the wake queue, to ensure that the operation of queueing the works and waking the stopper threads is atomic. Fixes: 0b26351b910f ("stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock") Signed-off-by: Prasad Sodagudi Signed-off-by: Isaac J. Manjarres Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: matt@codeblueprint.co.uk Cc: bigeasy@linutronix.de Cc: gregkh@linuxfoundation.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1533329766-4856-1-git-send-email-isaacm@codeaurora.org Co-Developed-by: Isaac J. 
Manjarres --- kernel/stop_machine.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..69eb76daed34 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -81,6 +81,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; bool enabled; + preempt_disable(); raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) @@ -90,6 +91,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_unlock_irqrestore(&stopper->lock, flags); wake_up_q(&wakeq); + preempt_enable(); return enabled; } -- cgit v1.2.3 From 82837ad5bda79cd13d64b9abad92e7872a06bc65 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 21:12:00 -0500 Subject: PM / hibernate: Mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This addresses Coverity-ID: 114713 ("Missing break in switch"). Signed-off-by: Gustavo A. R. Silva Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9c85c7822383..38f3f96b4b12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -638,6 +638,7 @@ static void power_down(void) break; case HIBERNATION_PLATFORM: hibernation_platform_enter(); + /* Fall through */ case HIBERNATION_SHUTDOWN: if (pm_power_off) kernel_power_off(); -- cgit v1.2.3 From 55f2503c3b69328735e88031ff8d6ba291bd952b Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Tue, 31 Jul 2018 16:51:32 +0800 Subject: PM / reboot: Eliminate race between reboot and suspend At present, "systemctl suspend" and "shutdown" can run in parrallel. A system can suspend after devices_shutdown(), and resume. Then the shutdown task goes on to power off. This causes many devices are not really shut off. Hence replacing reboot_mutex with system_transition_mutex (renamed from pm_mutex) to achieve the exclusion. The renaming of pm_mutex as system_transition_mutex can be better to reflect the purpose of the mutex. Signed-off-by: Pingfan Liu Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 4 +++- kernel/power/hibernate.c | 15 ++++++++------- kernel/power/main.c | 12 ++++++------ kernel/power/suspend.c | 4 ++-- kernel/power/user.c | 4 ++-- kernel/reboot.c | 6 +++--- 6 files changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 6f56a9e219fa..b162b74611e4 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -15,7 +15,9 @@ atomic_t system_freezing_cnt = ATOMIC_INIT(0); EXPORT_SYMBOL(system_freezing_cnt); -/* indicate whether PM freezing is in effect, protected by pm_mutex */ +/* indicate whether PM freezing is in effect, protected by + * system_transition_mutex + */ bool pm_freezing; bool pm_nosig_freezing; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 38f3f96b4b12..abef759de7c8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,7 +338,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. 
+ * This routine must be called with system_transition_mutex held. */ int hibernation_snapshot(int platform_mode) { @@ -500,8 +500,9 @@ static int resume_target_kernel(bool platform_mode) * hibernation_restore - Quiesce devices and restore from a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snapshot(). + * This routine must be called with system_transition_mutex held. If it is + * successful, control reappears in the restored target kernel in + * hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { @@ -806,13 +807,13 @@ static int software_resume(void) * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock + * calling hibernate functions (which take system_transition_mutex) + * this can cause lockdep to complain about a possible ABBA deadlock * which cannot happen since we're in the boot code here and * sysfs can't be invoked yet. Therefore, we use a subclass * here to avoid lockdep complaining. */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); if (swsusp_resume_device) goto Check_image; @@ -900,7 +901,7 @@ static int software_resume(void) atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: diff --git a/kernel/power/main.c b/kernel/power/main.c index d9706da10930..35b50823d83b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,17 +15,16 @@ #include #include #include +#include #include "power.h" -DEFINE_MUTEX(pm_mutex); - #ifdef CONFIG_PM_SLEEP void lock_system_sleep(void) { current->flags |= PF_FREEZER_SKIP; - mutex_lock(&pm_mutex); + mutex_lock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(lock_system_sleep); @@ -37,8 +36,9 @@ void unlock_system_sleep(void) * * Reason: * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the pm_mutex lock, - * since the freezer always works with pm_mutex held. + * doesn't come into effect until we release the + * system_transition_mutex lock, since the freezer always works with + * system_transition_mutex held. * * More importantly, in the case of hibernation, * unlock_system_sleep() gets called in snapshot_read() and @@ -47,7 +47,7 @@ void unlock_system_sleep(void) * enter the refrigerator, thus causing hibernation to lockup. 
*/ current->flags &= ~PF_FREEZER_SKIP; - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 87331565e505..9e13afe65a14 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -556,7 +556,7 @@ static int enter_state(suspend_state_t state) } else if (!valid_state(state)) { return -EINVAL; } - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; if (state == PM_SUSPEND_TO_IDLE) @@ -590,7 +590,7 @@ static int enter_state(suspend_state_t state) pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index abd225550271..2d8b60a3c86b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -216,7 +216,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; lock_device_hotplug(); @@ -394,7 +394,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } unlock_device_hotplug(); - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/reboot.c b/kernel/reboot.c index e4ced883d8de..8fb44dec9ad7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -294,7 +294,7 @@ void kernel_power_off(void) } EXPORT_SYMBOL_GPL(kernel_power_off); -static DEFINE_MUTEX(reboot_mutex); +DEFINE_MUTEX(system_transition_mutex); /* * Reboot system call: for obvious reasons only root may call it, @@ -338,7 +338,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = LINUX_REBOOT_CMD_HALT; - mutex_lock(&reboot_mutex); + mutex_lock(&system_transition_mutex); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: kernel_restart(NULL); @@ -389,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, ret = -EINVAL; break; } - mutex_unlock(&reboot_mutex); + mutex_unlock(&system_transition_mutex); return ret; } -- cgit v1.2.3 From bc2d8d262cba5736332cbc866acb11b1c5748aa9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Aug 2018 08:19:57 +0200 Subject: cpu/hotplug: Fix SMT supported evaluation Josh reported that the late SMT evaluation in cpu_smt_state_init() sets cpu_smt_control to CPU_SMT_NOT_SUPPORTED in case that 'nosmt' was supplied on the kernel command line as it cannot differentiate between SMT disabled by BIOS and SMT soft disable via 'nosmt'. That wreckages the state and makes the sysfs interface unusable. Rework this so that during bringup of the non boot CPUs the availability of SMT is determined in cpu_smt_allowed(). If a newly booted CPU is not a 'primary' thread then set the local cpu_smt_available marker and evaluate this explicitely right after the initial SMP bringup has finished. SMT evaulation on x86 is a trainwreck as the firmware has all the information _before_ booting the kernel, but there is no interface to query it. 
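The sysfs interface whose consistency this fix restores can be checked from userspace; a minimal sketch (assuming a kernel recent enough to expose the attribute) is:

/* Print the SMT control state the kernel exposes via sysfs. */
#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/sys/devices/system/cpu/smt/control", "r");

	if (!f) {
		perror("smt/control");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		/* expected values: on, off, forceoff or notsupported */
		printf("SMT control: %s", buf);
	fclose(f);
	return 0;
}

With the bug described above, a machine booted with 'nosmt' could wrongly report "notsupported" here instead of the soft-disabled state.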
Fixes: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") Reported-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 41 ++++++++++++++++++++++++++++------------- kernel/smp.c | 2 ++ 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 2bc3d2f5b2a5..9d2512dd263c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -347,6 +347,8 @@ EXPORT_SYMBOL_GPL(cpu_hotplug_enable); enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); +static bool cpu_smt_available __read_mostly; + void __init cpu_smt_disable(bool force) { if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || @@ -363,14 +365,28 @@ void __init cpu_smt_disable(bool force) /* * The decision whether SMT is supported can only be done after the full - * CPU identification. Called from architecture code. + * CPU identification. Called from architecture code before non boot CPUs + * are brought up. */ -void __init cpu_smt_check_topology(void) +void __init cpu_smt_check_topology_early(void) { if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; } +/* + * If SMT was disabled by BIOS, detect it here, after the CPUs have been + * brought online. This ensures the smt/l1tf sysfs entries are consistent + * with reality. cpu_smt_available is set to true during the bringup of non + * boot CPUs when a SMT sibling is detected. Note, this may overwrite + * cpu_smt_control's previous setting. + */ +void __init cpu_smt_check_topology(void) +{ + if (!cpu_smt_available) + cpu_smt_control = CPU_SMT_NOT_SUPPORTED; +} + static int __init smt_cmdline_disable(char *str) { cpu_smt_disable(str && !strcmp(str, "force")); @@ -380,10 +396,18 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (topology_is_primary_thread(cpu)) return true; - if (topology_is_primary_thread(cpu)) + /* + * If the CPU is not a 'primary' thread and the booted_once bit is + * set then the processor has SMT support. Store this information + * for the late check of SMT support in cpu_smt_check_topology(). + */ + if (per_cpu(cpuhp_state, cpu).booted_once) + cpu_smt_available = true; + + if (cpu_smt_control == CPU_SMT_ENABLED) return true; /* @@ -2125,15 +2149,6 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_state_init(void) { - /* - * If SMT was disabled by BIOS, detect it here, after the CPUs have - * been brought online. This ensures the smt/l1tf sysfs entries are - * consistent with reality. Note this may overwrite cpu_smt_control's - * previous setting. - */ - if (topology_max_smt_threads() == 1) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; - return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } diff --git a/kernel/smp.c b/kernel/smp.c index 084c8b3a2681..d86eec5f51c1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -584,6 +584,8 @@ void __init smp_init(void) num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? 
"s" : "")); + /* Final decision about SMT support */ + cpu_smt_check_topology(); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } -- cgit v1.2.3 From 85fc4b16aaf05fc8978d242c556a1711dce15cf8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Aug 2018 14:27:28 -0700 Subject: bpf: introduce update_effective_progs() __cgroup_bpf_attach() and __cgroup_bpf_detach() functions have a good amount of duplicated code, which is possible to eliminate by introducing the update_effective_progs() helper function. The update_effective_progs() calls compute_effective_progs() and then in case of success it calls activate_effective_progs() for each descendant cgroup. In case of failure (OOM), it releases allocated prog arrays and return the error code. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 99 ++++++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a4fe5a7dc91..6a7d931bbc55 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -177,6 +177,45 @@ cleanup: return -ENOMEM; } +static int update_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type) +{ + struct cgroup_subsys_state *css; + int err; + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return err; +} + #define BPF_CGROUP_MAX_PROGS 64 /** @@ -194,7 +233,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage, *old_storage = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -261,22 +299,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = flags; - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. 
Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); if (old_storage) @@ -289,16 +314,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and cleanup the prog list */ pl->prog = old_prog; bpf_cgroup_storage_free(pl->storage); @@ -326,7 +341,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; int err; @@ -365,22 +379,9 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; } - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; /* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -396,16 +397,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and restore back old_prog */ pl->prog = old_prog; return err; -- cgit v1.2.3 From dc1508a579e682a1e5f1ed0753390e0aa7c23a97 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Aug 2018 08:55:19 -0700 Subject: bpf: fix bpffs non-array map seq_show issue In function map_seq_next() of kernel/bpf/inode.c, the first key will be the "0" regardless of the map type. This works for array. But for hash type, if it happens key "0" is in the map, the bpffs map show will miss some items if the key "0" is not the first element of the first bucket. This patch fixed the issue by guaranteeing to get the first element, if the seq_show is just started, by passing NULL pointer key to map_get_next_key() callback. This way, no missing elements will occur for bpffs hash table show even if key "0" is in the map. 
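The fix relies on the convention that passing a NULL previous key to the map's get_next_key operation returns the first element. The same convention drives map iteration from userspace; a sketch using libbpf is shown below (the pinned path and the u32 key/value layout are assumptions for illustration):

/* Walk a BPF map with u32 keys and values via bpf_map_get_next_key(). */
#include <bpf/bpf.h>
#include <stdint.h>
#include <stdio.h>

static void dump_map(int map_fd)
{
	uint32_t key, next_key, value;

	/* A NULL previous key asks the kernel for the first element,
	 * regardless of whether a key of 0 happens to exist in the map.
	 */
	if (bpf_map_get_next_key(map_fd, NULL, &next_key))
		return;	/* empty map */

	do {
		key = next_key;
		if (!bpf_map_lookup_elem(map_fd, &key, &value))
			printf("%u: %u\n", key, value);
	} while (!bpf_map_get_next_key(map_fd, &key, &next_key));
}

int main(void)
{
	int map_fd = bpf_obj_get("/sys/fs/bpf/example_map");	/* hypothetical pin path */

	if (map_fd < 0)
		return 1;
	dump_map(map_fd);
	return 0;
}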
Fixes: a26ca7c982cb5 ("bpf: btf: Add pretty print support to the basic arraymap") Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 76efe9a183f5..fc5b103512e7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; + void *prev_key; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) - goto done; + prev_key = NULL; + else + prev_key = key; - if (map->ops->map_get_next_key(map, key, key)) { + if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; return NULL; } -done: ++(*pos); return key; } -- cgit v1.2.3 From 699c86d6ec21d0f885d12800249d138659de8489 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Aug 2018 08:55:20 -0700 Subject: bpf: btf: add pretty print for hash/lru_hash maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") added pretty print support to array map. This patch adds pretty print for hash and lru_hash maps. The following example shows the pretty-print result of a pinned hashmap: struct map_value { int count_a; int count_b; }; cat /sys/fs/bpf/pinned_hash_map: 87907: {87907,87908} 57354: {37354,57355} 76625: {76625,76626} ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 513d9dfcf4ee..d6110042e0d9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -11,9 +11,11 @@ * General Public License for more details. 
*/ #include +#include #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -1162,6 +1164,44 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } +static void htab_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = htab_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1171,6 +1211,8 @@ const struct bpf_map_ops htab_map_ops = { .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, + .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1182,6 +1224,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, + .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ -- cgit v1.2.3 From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:24 -0700 Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decision. In this case, decide which SO_REUSEPORT sk to serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control and visibility on where a SO_REUSEPORT sk should be located in a bpf map. The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sk serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection info contact/API stays within the userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. 
The old process can finish draining the pending requests (e.g. by "accept()") before closing the old fds. [Note that deleting a fd from a bpf map does not necessarily mean the fd is closed.] During map_update_elem(), only a SO_REUSEPORT sk (i.e. one which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that has done "bind()" for UDP or "bind()+listen()" for TCP. These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage on the BPF side. When a SO_REUSEPORT sk is deleted from the "reuse->socks[]" (e.g. on "close()"), it will notify the bpf map to remove it from the map also. This is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->socks[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the sk being added passes some strict tests, and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64 bits, but that goes against the logical userspace expectation of a 32bit sizeof(fd) (as for other fd based bpf maps). It may catch users by surprise if we enforce value_size=8 while userspace still passes a 32bit fd during update. Supporting different value_size between lookup and update seems unintuitive as well. We also need to consider that other existing fd based maps may want to return a 64bit value from the syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, assuming users will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will only return the 64bit value from sock_gen_cookie() when the user consciously chooses value_size=8 (as a signal that lookup is desired), which then requires a 64bit value in both lookup and update.
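To make the update rules above concrete, the sketch below creates a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY from userspace and installs one listening SO_REUSEPORT TCP socket into slot 0. It is illustrative only: it uses the bpf_map_create() API of current libbpf (which postdates this series), error handling is minimal, and the port, map name and map size are arbitrary.

/* Create a reuseport sockarray and install a listening socket in slot 0. */
#include <bpf/bpf.h>
#include <linux/bpf.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(8080),
	};
	int one = 1, map_fd, sk;
	__u32 slot = 0;
	__u64 value;

	/* value_size = 8: syscall-side lookups then return the socket cookie */
	map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, "reuse_map",
				sizeof(__u32), sizeof(__u64), 16, NULL);
	if (map_fd < 0)
		return 1;

	sk = socket(AF_INET, SOCK_STREAM, 0);
	setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) || listen(sk, 128))
		return 1;

	/* Only a hashed SO_REUSEPORT socket passes reuseport_array_update_check(). */
	value = sk;
	return bpf_map_update_elem(map_fd, &slot, &value, BPF_ANY);
}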
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/Makefile | 3 + kernel/bpf/arraymap.c | 2 +- kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + 4 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/reuseport_array.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e8906cbad81f..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030c77..f6ca3e712831 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include +#include +#include +#include + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. 
+ */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. 
+ */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. 
+ */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5af4e9e2722d..57f4d076141b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); -- cgit v1.2.3 From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not clear if the lower layer is only ipv4 and ipv6 in the future and will it not touch the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above reason, if sk->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 bytes cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and setting the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also inline with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantic similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. 
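(As an aside before the rest of the description: a deliberately trivial sketch of such a program, with an assumed map name and a fixed selection policy, could look like the following.)

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical SK_REUSEPORT program: always pick the socket in slot 0. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 16);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
} reuse_map SEC(".maps");

SEC("sk_reuseport")
int select_slot0(struct sk_reuseport_md *reuse_md)
{
	__u32 index = 0;

	/* The helper validates the selected socket at run time and fails,
	 * e.g. on a reuseport_id mismatch, if it cannot serve this group.
	 */
	if (bpf_sk_select_reuseport(reuse_md, &reuse_map, &index, 0))
		return SK_DROP;

	return SK_PASS;
}

char _license[] SEC("license") = "GPL";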
The "bind_inany" tells the bpf prog that the reuseport group is bind-ed to a local INANY address which cannot be learned from skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is adding to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked in run time in "sk_select_reuseport()". Doing the check in verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. reuseport_id not match). The bpf prog can decide if it wants to do SK_DROP as its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it does , it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } -- cgit v1.2.3 From e8d2bec0457962e8f348a9a3627b398f7fe5c5fc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 12 Aug 2018 01:59:17 +0200 Subject: bpf: decouple btf from seq bpf fs dump and enable more maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for array and hash/lru map. However, both can be decoupled from each other such that regular BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. 
The basic sanity check which is a prerequisite for all maps is that key/value size has to match in any case, and some maps can have extra checks via map_check_btf() callback, e.g. probing certain types or indicating no support in general. With that we can also enable retrieving BTF info for per-cpu map types and lpm. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song --- kernel/bpf/arraymap.c | 26 ++++++++++++-------------- kernel/bpf/cpumap.c | 1 + kernel/bpf/devmap.c | 1 + kernel/bpf/hashtab.c | 20 +------------------- kernel/bpf/inode.c | 3 ++- kernel/bpf/local_storage.c | 1 + kernel/bpf/lpm_trie.c | 12 ++++++++++++ kernel/bpf/sockmap.c | 2 ++ kernel/bpf/stackmap.c | 1 + kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++++++---- kernel/bpf/xskmap.c | 3 +-- 11 files changed, 66 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f6ca3e712831..0c17aab3ce5f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. 
*/ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0918d180f08..3b494941a44a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d361fc1e3bf3..a7c6620d8389 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -484,6 +484,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d6110042e0d9..04b8eda94e7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1185,23 +1185,6 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) - return -EINVAL; - - return 0; -} - const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1212,7 +1195,6 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = 
htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1225,7 +1207,6 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ @@ -1452,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fc5b103512e7..2ada5e21dfa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -334,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? + &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc4e37f68f2a..22ad967d1e5f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1603492c9cc7..9058317ba9de 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include +#include #include #include #include #include #include +#include /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -686,6 +688,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? 
+ -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0b38be5a955c..e376ffffebce 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2495,6 +2495,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2505,6 +2506,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b675a3f3d141..8061a439ef18 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57f4d076141b..43727ed0d94a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -455,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -489,8 +518,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -504,8 +532,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; diff --git 
a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..4ddf61e158f6 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - -- cgit v1.2.3 From 93cb8e20d56be40c541475f77b5f565fbb385a4b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 1 Jul 2018 18:18:15 +0200 Subject: parisc: Drop architecture-specific ENOTSUP define parisc is the only Linux architecture which has defined a value for ENOTSUP. All other architectures #define ENOTSUP as EOPNOTSUPP in their libc headers. Having an own value for ENOTSUP which is different than EOPNOTSUPP often gives problems with userspace programs which expect both to be the same. One such example is a build error in the libuv package, as can be seen in https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=900237. Since we dropped HP-UX support, there is no real benefit in keeping an own value for ENOTSUP. This patch drops the parisc value for ENOTSUP from the kernel sources. glibc needs no patch, it reuses the exported headers. Signed-off-by: Helge Deller --- kernel/time/posix-timers.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e08ce3f27447..66321cbdf90c 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -85,15 +85,6 @@ static const struct k_clock clock_realtime, clock_monotonic; #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * parisc wants ENOTSUP instead of EOPNOTSUPP - */ -#ifndef ENOTSUP -# define ENANOSLEEP_NOTSUP EOPNOTSUPP -#else -# define ENANOSLEEP_NOTSUP ENOTSUP -#endif - /* * The timer ID is turned into a timer address by idr_find(). * Verifying a valid ID consists of: @@ -1220,7 +1211,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!kc) return -EINVAL; if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; + return -EOPNOTSUPP; if (get_timespec64(&t, rqtp)) return -EFAULT; @@ -1247,7 +1238,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc) return -EINVAL; if (!kc->nsleep) - return -ENANOSLEEP_NOTSUP; + return -EOPNOTSUPP; if (compat_get_timespec64(&t, rqtp)) return -EFAULT; -- cgit v1.2.3 From 269777aa530f3438ec1781586cdac0b5fe47b061 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Wed, 15 Aug 2018 00:26:00 +0300 Subject: cpu/hotplug: Non-SMP machines do not make use of booted_once Commit 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") breaks non-SMP builds. [ I suspect the 'bool' fields should just be made to be bitfields and be exposed regardless of configuration, but that's a separate cleanup that I'll leave to the owners of this file for later. - Linus ] Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") Cc: Dave Hansen Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Abel Vesa Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 90ec528a6e32..ed44d7d34c2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2278,6 +2278,8 @@ void __init boot_cpu_init(void) */ void __init boot_cpu_hotplug_init(void) { +#ifdef CONFIG_SMP this_cpu_write(cpuhp_state.booted_once, true); +#endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit v1.2.3
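Finally, the parisc ENOTSUP removal above rests on the observation that every other Linux architecture's libc defines ENOTSUP as EOPNOTSUPP. That is easy to confirm from userspace with a standalone check (unrelated to the kernel patches themselves):

/* Check whether this libc defines ENOTSUP as the same value as EOPNOTSUPP. */
#include <errno.h>
#include <stdio.h>

int main(void)
{
#ifdef ENOTSUP
	printf("ENOTSUP = %d, EOPNOTSUPP = %d (%s)\n", ENOTSUP, EOPNOTSUPP,
	       ENOTSUP == EOPNOTSUPP ? "same" : "different");
#else
	printf("ENOTSUP is not defined; EOPNOTSUPP = %d\n", EOPNOTSUPP);
#endif
	return 0;
}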