From a2775bbc1d58ce630517dfe86090c166f27d719f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 12 Mar 2019 21:21:26 +0100 Subject: kernel/workqueue: Use __printf markup to silence compiler in function 'alloc_workqueue' Silence warnings (triggered at W=1) by adding the relevant __printf attribute. kernel/workqueue.c:4249:2: warning: function 'alloc_workqueue' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format] Signed-off-by: Mathieu Malaterre Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..56b7cf898f10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4208,6 +4208,7 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } +__printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) -- cgit v1.2.3 From 95e0b46fcebd7dbf6850dee96046e4c4ddc7f69c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 7 Mar 2019 09:16:24 +0800 Subject: audit: fix a memleak caused by auditing load module module.name is allocated unconditionally when auditing a module load, but audit_log_start() can fail, and audit_log_exit() may never be called, in which case module.name is never freed. Fix this by freeing module.name in audit_free_context() and __audit_syscall_exit(). unreferenced object 0xffff88af90837d20 (size 8): comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s) hex dump (first 8 bytes): 69 78 67 62 65 00 ff ff ixgbe... backtrace: [<0000000008da28fe>] __audit_log_kern_module+0x33/0x80 [<00000000c1491e61>] load_module+0x64f/0x3850 [<000000007fc9ae3f>] __do_sys_init_module+0x218/0x250 [<0000000000d4a478>] do_syscall_64+0x117/0x400 [<000000004924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000007dc331dd>] 0xffffffffffffffff Fixes: ca86cad7380e3 ("audit: log module name on init_module") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing [PM: manual merge fixup in __audit_syscall_exit()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..fa7b8047aab8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -840,6 +840,13 @@ static inline void audit_proctitle_free(struct audit_context *context) context->proctitle.len = 0; } +static inline void audit_free_module(struct audit_context *context) +{ + if (context->type == AUDIT_KERN_MODULE) { + kfree(context->module.name); + context->module.name = NULL; + } +} static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -923,6 +930,7 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); free_tree_refs(context); @@ -1266,7 +1274,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); } else audit_log_format(ab, "(null)"); @@ -1697,6 +1704,7 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ?
~0ULL : 0; + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); -- cgit v1.2.3 From 8194fe94ab08f56ea55653df924647d23873d18c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Mar 2019 10:45:09 -0700 Subject: kernel/workqueue: Document wq_worker_last_func() argument This patch avoids the following warning, reported when building with W=1: kernel/workqueue.c:938: warning: Function parameter or member 'task' not described in 'wq_worker_last_func' Signed-off-by: Bart Van Assche Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 56b7cf898f10..21721faa923c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -913,6 +913,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) /** * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. -- cgit v1.2.3 From 73e65b88feb919f95bdb77c4ed35f69588cf27ee Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 19 Mar 2019 15:23:29 -0400 Subject: audit: connect LOGIN record to its syscall record Currently the AUDIT_LOGIN event is a standalone record that isn't connected to any other records that may be part of its syscall event. To avoid the confusion of generating two events, connect the records by using the task's syscall audit context. Please see the github issue https://github.com/linux-audit/audit-kernel/issues/110 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c89ea48c70a6..b96bf69183f4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2220,7 +2220,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; -- cgit v1.2.3 From 2efa48fec0c344a6ca1bba66b15d63d38cf20199 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:59:22 +0800 Subject: audit: Make audit_log_cap and audit_copy_inode static Fix the following sparse warnings: kernel/auditsc.c:1150:6: warning: symbol 'audit_log_cap' was not declared. Should it be static? kernel/auditsc.c:1908:6: warning: symbol 'audit_copy_inode' was not declared. Should it be static? Signed-off-by: YueHaibing Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fa7b8047aab8..17b0007fafc2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1147,7 +1147,8 @@ out: kfree(buf_head); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +static void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap) { int i; @@ -1905,8 +1906,9 @@ static inline int audit_copy_fcaps(struct audit_names *name, } /* Copy inode data into an audit_names.
*/ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode, unsigned int flags) +static void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; -- cgit v1.2.3 From 16add411645cff83360086e102daa67b25f1e39a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:18 +0300 Subject: syscall_get_arch: add "struct task_struct *" argument This argument is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going to be called from ptrace_request() along with syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions with a tracee as their argument. The primary intent is that the triple (audit_arch, syscall_nr, arg1..arg6) should describe what system call is being called and what its arguments are. Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments") Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()") Reviewed-by: Andy Lutomirski # for x86 Reviewed-by: Palmer Dabbelt Acked-by: Paul Moore Acked-by: Paul Burton # MIPS parts Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook # seccomp parts Acked-by: Mark Salter # for the c6x bit Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: x86@kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. 
Levin Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- kernel/seccomp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17b0007fafc2..98a98e6dca05 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1636,7 +1636,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -2590,7 +2590,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..36f36ab00f48 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, 0, 6, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -591,7 +591,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } -- cgit v1.2.3 From 0f3adc288df8ba2ac2fea0a8e4890f9fb4cd075d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:53:59 +0800 Subject: bpf: track references based on is_acquire_func So far, the verifier only acquires reference tracking state for RET_PTR_TO_SOCKET_OR_NULL. Instead of extending this for every new return type which desires these semantics, acquire reference tracking state iff the called helper is an acquire function. 
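To make the acquire/release contract concrete, here is a minimal program-side sketch of what the verifier's reference tracking enforces (illustrative only; the includes, section name and empty tuple are assumptions, not part of this patch):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("classifier")
int ref_track_demo(struct __sk_buff *skb)
{
	/* A real program would fill the tuple from packet data. */
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	/* bpf_sk_lookup_tcp() is an acquire function... */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (sk)
		/* ...so the reference must be released on every path,
		 * otherwise the verifier rejects the program with an
		 * unreleased-reference error. */
		bpf_sk_release(sk);
	return 0;
}

char _license[] SEC("license") = "GPL";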
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86f9cd5d1c4e..868a82ad5597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3147,19 +3147,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3170,9 +3158,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); -- cgit v1.2.3 From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 868a82ad5597..a476e13201d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3148,6 +3148,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; -- cgit v1.2.3 From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. 
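As a similarly hedged sketch of what the new return type enables (substituting bpf_skc_lookup_tcp() into the example above; tuple setup again omitted):

	sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (sk) {
		/* sk may now reference a full, timewait or request
		 * socket; it must still be released like any other
		 * acquired reference. */
		bpf_sk_release(sk);
	}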
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476e13201d6..dffeec3706ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) -- cgit v1.2.3 From d7dcf26ff0ffd7b56fe2b09ed7f1867589f3cdf1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2019 23:48:21 +0100 Subject: softirq: Remove tasklet_hrtimer There are no more users of this interface. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: David S. Miller Cc: netdev@vger.kernel.org Link: https://lkml.kernel.org/r/20190301224821.29843-4-bigeasy@linutronix.de --- kernel/softirq.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 10277429ed84..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t) } EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. - */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ - struct tasklet_hrtimer *ttimer = - container_of(timer, struct tasklet_hrtimer, timer); - - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ - struct tasklet_hrtimer *ttimer = (void *)data; - enum hrtimer_restart restart; - - restart = ttimer->function(&ttimer->timer); - if (restart != HRTIMER_NORESTART) - hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode) -{ - hrtimer_init(&ttimer->timer, which_clock, mode); - ttimer->timer.function = __hrtimer_tasklet_trampoline; - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, - (unsigned long)ttimer); - ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); - void __init softirq_init(void) { int cpu; -- cgit v1.2.3 From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. 
The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..1b942a7caf26 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -650,16 +650,37 @@ static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, + /* policy enforced later */ + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, + /* policy enforced later */ + .flags = GENL_CMD_CAP_HASPOL, }, }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + const struct nla_policy *policy = NULL; + + switch (ops->cmd) { + case TASKSTATS_CMD_GET: + policy = taskstats_cmd_get_policy; + break; + case CGROUPSTATS_CMD_GET: + policy = cgroupstats_cmd_get_policy; + break; + default: + return -EINVAL; + } + + return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, + policy, info->extack); +} + static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, @@ -667,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ -- cgit v1.2.3 From e1e41b6ce5f9c1a80bf4f2404ec5ab11c6c5a2ad Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 18 Mar 2019 20:55:56 +0100 Subject: timekeeping: Consistently use unsigned int for seqcount snapshot The timekeeping code uses a random mix of "unsigned long" and "unsigned int" for the seqcount snapshots (ratio 14:12). Since the seqlock.h API is entirely based on unsigned int, use that throughout. 
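For reference, the read-side pattern in question, as a minimal sketch using the unsigned int type the seqcount API in seqlock.h deals in (the sampled field is illustrative):

	unsigned int seq;
	u64 secs;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		secs = tk_core.timekeeper.xtime_sec;	/* any snapshot read */
	} while (read_seqcount_retry(&tk_core.seq, seq));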
Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: John Stultz Cc: Stephen Boyd Link: https://lkml.kernel.org/r/20190318195557.20773-1-linux@rasmusvillemoes.dk --- kernel/time/jiffies.c | 2 +- kernel/time/sched_clock.c | 4 ++-- kernel/time/tick-common.c | 2 +- kernel/time/tick-sched.c | 3 ++- kernel/time/timekeeping.c | 18 +++++++++--------- 5 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..95f8f3304c19 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { - unsigned long seq; + unsigned int seq; u64 ret; do { diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..16b80c2b4fe8 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { u64 cyc, res; - unsigned long seq; + unsigned int seq; struct clock_read_data *rd; do { @@ -267,7 +267,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned long seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..561641b2153f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -149,7 +149,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { - unsigned long seq; + unsigned int seq; ktime_t next; do { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..b50f6f22c88e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -645,7 +645,8 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; - unsigned long seq, basejiff; + unsigned long basejiff; + unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..540145da33da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; - unsigned long seq; + unsigned int seq; ktime_t tconv; do { @@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void) void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; ktime_t base_raw; ktime_t base_real; u64 nsec_raw; @@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn) ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; - unsigned long seq; + unsigned int seq; bool do_interp; int ret; @@ -1409,7 +1409,7 @@ 
int timekeeping_notify(struct clocksource *clock) void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; do { @@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64); int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; int ret; do { @@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void) u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); -- cgit v1.2.3 From 1b72d43237980eab9b6ae6bb8181e51c840377e6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Mar 2019 16:39:20 +0100 Subject: tick: Remove outgoing CPU from broadcast masks Valentin reported that unplugging a CPU occasionally results in a warning in the tick broadcast code which is triggered when an offline CPU is in the broadcast mask. This happens because the outgoing CPU is not removing itself from the broadcast masks, especially not from the broadcast_force_mask. The removal happens on the control CPU after the outgoing CPU is dead. It's a long standing issue, but the warning is harmless. Rework the hotplug mechanism so that the outgoing CPU removes itself from the broadcast masks after disabling interrupts and removing itself from the online mask. Reported-by: Valentin Schneider Signed-off-by: Thomas Gleixner Tested-by: Valentin Schneider Cc: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903211540180.1784@nanos.tec.linutronix.de --- kernel/cpu.c | 2 ++ kernel/time/clockevents.c | 18 ++++++++++++++++-- kernel/time/tick-broadcast.c | 40 +++++++++++++++++++--------------------- kernel/time/tick-internal.h | 10 ++++++---- 4 files changed, 43 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 025f419d16f6..f69ba38573c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -844,6 +844,8 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); + /* Remove CPU from timer broadcasting */ + tick_offline_cpu(cpu); /* Park the stopper thread */ stop_machine_park(cpu); return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void) } #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu: The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. 
+ */ +void tick_offline_cpu(unsigned int cpu) +{ + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); + raw_spin_unlock(&clockevents_lock); +} +# endif + /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu) raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_shutdown_broadcast_oneshot(cpu); - tick_shutdown_broadcast(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee834d4fb814..0283523de045 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,10 +36,12 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +static void tick_broadcast_oneshot_offline(unsigned int cpu); #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } #endif /* @@ -433,27 +435,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) } #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void) { - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_broadcast_mask); - cpumask_clear_cpu(cpu, tick_broadcast_on); + struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } +} - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ + raw_spin_lock(&tick_broadcast_lock); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + tick_broadcast_oneshot_offline(cpu); + tick_shutdown_broadcast(); + raw_spin_unlock(&tick_broadcast_lock); } + #endif void tick_suspend_broadcast(void) @@ -950,14 +954,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu) { - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! 
@@ -965,8 +965,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } static inline bool tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); -- cgit v1.2.3 From d6b87eaf10bd061914f6d277d7428b3285d8850e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:18 +0100 Subject: tick/sched: Update tick_sched struct documentation Adapt the documentation order of struct members to the effective order of struct members and add missing descriptions. 
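The kernel-doc convention being restored here documents members in declaration order; a minimal, hypothetical example of the format:

/**
 * struct foo - one-line description of the structure
 * @first: members are described in the same order as they are declared
 * @second: a member left undescribed triggers a W=1 kernel-doc warning
 */
struct foo {
	int first;
	int second;
};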
Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/20190321120921.16463-2-anna-maria@linutronix.de --- kernel/time/tick-sched.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode + * @check_clocks: Notification mechanism about clocksource changes + * @nohz_mode: Mode - one state of tick_nohz_mode + * @inidle: Indicator that the CPU is in the tick idle mode + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_active: Indicator that the CPU is actively in the tick idle mode; + * it is resetted during irq handling phases. + * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. - * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode { * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires - * @do_timer_lst: CPU was the last one doing do_timer before going idle - * @got_idle_tick: Tick timer function has run with @inidle set + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick */ struct tick_sched { struct hrtimer sched_timer; -- cgit v1.2.3 From dc1e7dc5ac6254ba0502323381a7ec847e408f1d Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:19 +0100 Subject: timer: Move trace point to get proper index When placing the timer_start trace point before the timer wheel bucket index is calculated, the index information in the trace point is useless. It is not possible to simply move the debug_activate() call after the index calculation, because debug_object_activate() needs to be called before touching the object. Therefore split debug_activate() and move the trace point into enqueue_timer() after the new index has been calculated. The debug_object_activate() call remains at the original place. 
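A simplified sketch of the resulting ordering (calc_wheel_index() is the index helper in kernel/time/timer.c of this era; locking and error handling omitted):

	debug_timer_activate(timer);	/* debug_objects hook: before the timer is touched */
	idx = calc_wheel_index(expires, base->clk);
	enqueue_timer(base, timer, idx);	/* trace_timer_start() fires in here, once idx is known */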
Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Steven Rostedt Link: https://lkml.kernel.org/r/20190321120921.16463-3-anna-maria@linutronix.de --- kernel/time/timer.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..8d7918ae4d0c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); } static void @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer) trace_timer_init(timer); } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires, timer->flags); -} - static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } - debug_activate(timer, expires); + debug_timer_activate(timer); timer->expires = expires; /* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu) } forward_timer_base(base); - debug_activate(timer, timer->expires); + debug_timer_activate(timer); internal_add_timer(base, timer); raw_spin_unlock_irqrestore(&base->lock, flags); } -- cgit v1.2.3 From f28d3d5346e97e60c81f933ac89ccf015430e5cf Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:21 +0100 Subject: timer/trace: Improve timer tracing Timers are added to the timer wheel off by one. This is required in case a timer is queued directly before incrementing jiffies to prevent early timer expiry. When reading a timer trace and relying only on the expiry time of the timer in the timer_start trace point and on the now in the timer_expiry_entry trace point, it seems that the timer fires late. With the current timer_expiry_entry trace point information only now=jiffies is printed but not the value of base->clk. This makes it impossible to draw a conclusion to the index of base->clk and makes it impossible to examine timer problems without additional trace points. Therefore add the base->clk value to the timer_expire_entry trace point, to be able to calculate the index the timer base is located at during collecting expired timers. 
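The "off by one" comes from the wheel's index calculation, which rounds an expiry up to the next bucket of a level's granularity so that a timer can never fire early; a sketch modeled on the calc_index() helper in kernel/time/timer.c of this era:

static inline unsigned calc_index(unsigned expires, unsigned lvl)
{
	/* Round the expiry up to this level's granularity... */
	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
	/* ...and map it into this level's slice of the bucket array. */
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}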
Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Steven Rostedt Link: https://lkml.kernel.org/r/20190321120921.16463-5-anna-maria@linutronix.de --- kernel/time/timer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d7918ae4d0c..a9b1bbc2d88d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1293,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, + void (*fn)(struct timer_list *), + unsigned long baseclk) { int count = preempt_count(); @@ -1316,7 +1318,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list */ lock_map_acquire(&lockdep_map); - trace_timer_expire_entry(timer); + trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); @@ -1337,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list static void expire_timers(struct timer_base *base, struct hlist_head *head) { + /* + * This value is required only for tracing. base->clk was + * incremented directly before expire_timers was called. But expiry + * is related to the old base->clk value. + */ + unsigned long baseclk = base->clk - 1; + while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); @@ -1350,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); } } -- cgit v1.2.3 From 59c39840f5abf4a71e1810a8da71aaccd6c17d26 Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Sun, 24 Mar 2019 07:57:04 -0700 Subject: genirq: Prevent use-after-free and work list corruption When irq_set_affinity_notifier() replaces the notifier, the reference count on the old notifier is dropped, which causes it to be freed. But nothing ensures that the old notifier is no longer queued in the work list. If it is still queued, this results in a use-after-free and possibly in work list corruption. Ensure that the work is canceled before the reference is dropped. Signed-off-by: Prasad Sodagudi Signed-off-by: Thomas Gleixner Cc: marc.zyngier@arm.com Link: https://lkml.kernel.org/r/1553439424-6529-1-git-send-email-psodagud@codeaurora.org --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1401afa0d58a..53a081392115 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); - if (old_notify) + if (old_notify) { + cancel_work_sync(&old_notify->work); kref_put(&old_notify->kref, old_notify->release); + } return 0; } -- cgit v1.2.3 From e85e6a21b2b5f31148cc3f2e785262b37c3e1ec7 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Thu, 10 Jan 2019 15:30:15 -0800 Subject: rcu: Unconditionally expedite during suspend/hibernate The rcu_pm_notify() function refuses to switch to/from expedited grace periods on systems with more than 256 CPUs due to the serialized initialization of expedited grace periods. However, expedited grace periods are now initialized in parallel, removing this concern. This commit therefore removes the checks from rcu_pm_notify(), so that expedited grace periods are used unconditionally during suspend/resume and hibernate/wake operations. As always, real-time workloads wishing to completely avoid expedited grace periods can use the rcupdate.rcu_normal= kernel parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..95e3250b7b6e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3559,13 +3559,11 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedite_gp(); + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); break; default: break; -- cgit v1.2.3 From 671a63517cf983ad8eaa324167165cef245ab744 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 19 Jan 2019 11:14:18 -0500 Subject: rcu: Avoid unnecessary softirq when system is idle When there are no callbacks pending on an idle system, I noticed that RCU softirq is continuously firing. During this, cpu_no_qs is set to false, and core_needs_qs is set to true indefinitely. This causes rcu_process_callbacks to be repeatedly called, even though the node corresponding to the CPU has that CPU's mask bit cleared and the system is idle. I believe the race is when such mask clearing is done during idle CPU scan of the quiescent state forcing stage in the kthread instead of the softirq. Since the rnp mask is cleared, but the flags on the CPU's rdp are not cleared, the CPU thinks it still needs to report to core RCU. Cure this by clearing the core_needs_qs flag when the CPU detects that its node is already updated, which avoids the unwanted softirq raises, to the benefit of real-time systems. Test: Ran rcutorture for various tree RCU configs. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 95e3250b7b6e..2f78a115d34c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2296,6 +2296,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { + rdp->core_needs_qs = false; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { rdp->core_needs_qs = false; -- cgit v1.2.3 From 18d7e40679ef574e428f893101be1c0035e95ee3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 24 Jan 2019 21:14:37 +0300 Subject: rcu: rcu_qs -- Use raise_softirq_irqoff to not save irqs twice rcu_qs() disables IRQs itself, so there is no need for raise_softirq() to do the same; instead, we can save some cycles by using raise_softirq_irqoff() directly. CC: Paul E. McKenney Signed-off-by: Cyrill Gorcunov Signed-off-by: Paul E.
McKenney --- kernel/rcu/tiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 911bd9076d43..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -52,7 +52,7 @@ void rcu_qs(void) local_irq_save(flags); if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; - raise_softirq(RCU_SOFTIRQ); + raise_softirq_irqoff(RCU_SOFTIRQ); } local_irq_restore(flags); } -- cgit v1.2.3 From 884157cef0acf05648fe921d80c680afababb428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Feb 2019 07:21:29 -0800 Subject: rcu: Make exit_rcu() handle non-preempted RCU readers The purpose of exit_rcu() is to handle cases where buggy code causes a task to exit within an RCU read-side critical section. It currently does that in the case where said RCU read-side critical section was preempted at least once, but fails to handle cases where preemption did not occur. This case needs to be handled because otherwise the final context switch away from the exiting task will incorrectly behave as if task exit were instead a preemption of an RCU read-side critical section, and will therefore queue the exiting task. The exiting task will have exited, and thus won't ever execute rcu_read_unlock(), which means that it will remain queued forever, blocking all subsequent grace periods, and eventually resulting in OOM. Although this is arguably better than letting grace periods proceed and having a later rcu_read_unlock() access the now-freed task structure that once belonged to the exiting tasks, it would obviously be better to correctly handle this case. This commit therefore sets ->rcu_read_lock_nesting to 1 in that case, so that the subsequence call to __rcu_read_unlock() causes the exiting task to exit its dangling RCU read-side critical section. Note that deferred quiescent states need not be considered. The reason is that removing the task from the ->blkd_tasks[] list in the call to rcu_preempt_deferred_qs() handles the per-task component of any deferred quiescent state, and all other components of any deferred quiescent state are associated with the CPU, which isn't going anywhere until some later CPU-hotplug operation, which will report any remaining deferred quiescent states from within the rcu_report_dead() function. Note also that negative values of ->rcu_read_lock_nesting need not be considered. First, these won't show up in exit_rcu() unless there is a serious bug in RCU, and second, setting ->rcu_read_lock_nesting sets the state so that the RCU read-side critical section will be exited normally. Again, this code has no effect unless there has been some prior bug that prevents a task from leaving an RCU read-side critical section before exiting. Furthermore, there have been no reports of the bug fixed by this commit appearing in production. This commit is therefore absolutely -not- recommended for backporting to -stable. Reported-by: ABHISHEK DUBEY Reported-by: BHARATH Y MOURYA Reported-by: Aravinda Prasad Signed-off-by: Paul E. 
McKenney Tested-by: ABHISHEK DUBEY --- kernel/rcu/tree_plugin.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..d408661d5fb7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -804,19 +804,25 @@ static void rcu_flavor_sched_clock_irq(int user) /* * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so. No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort. Spewing warnings from this + * function is like as not to simply obscure important prior warnings. */ void exit_rcu(void) { struct task_struct *t = current; - if (likely(list_empty(¤t->rcu_node_entry))) + if (unlikely(!list_empty(¤t->rcu_node_entry))) { + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special.b.blocked = true; + } else if (unlikely(t->rcu_read_lock_nesting)) { + t->rcu_read_lock_nesting = 1; + } else { return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special.b.blocked = true; + } __rcu_read_unlock(); rcu_preempt_deferred_qs(current); } -- cgit v1.2.3 From 3ffe3d1adc0b6cfb9b24db9995a537bb0aa30a8b Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 21 Feb 2019 22:13:27 +0800 Subject: rcu: Set rcutree.kthread_prio sysfs access to read-only The rcutree.kthread_prio kernel-boot parameter is used to set the priority for boost (rcub), per-CPU (rcuc), and grace-period (rcu_preempt or rcu_sched) kthreads. It is also used by rcutorture to check whether it is possible to meaningfully test RCU priority boosting. However, all of these cases will either ignore or be confused by any post-boot changes to rcutree.kthread_prio. Note that the user really can change the priorities of all of these kthreads using chrt, given sufficient privileges. Therefore, the read-write nature of sysfs access to rcutree.kthread_prio is thus at best an attractive nuisance. This commit therefore changes sysfs access to rcutree.kthread_prio to be read-only. Signed-off-by: Liu Song Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2f78a115d34c..296131450414 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,7 @@ static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. */ -- cgit v1.2.3 From b2eb85b49a576515fb845cb12568b173c2bedffc Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sat, 2 Mar 2019 17:25:19 +0900 Subject: rcu: Move common code out of if-else block As the result of recent addition of "rdp->core_needs_qs = false;" in the "if" block, now both branches of the if-else have the same assignment. Factor it out and reduce line count. Signed-off-by: Akira Yokosawa Cc: Joel Fernandes Signed-off-by: Paul E. 
McKenney Acked-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 296131450414..5aefd36ac648 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2295,12 +2295,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { - rdp->core_needs_qs = false; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = false; - /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. -- cgit v1.2.3 From da8739f23fadf05809c6c37c327367b229467045 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Mar 2019 15:28:19 -0800 Subject: rcu: Allow rcu_nocbs= to specify all CPUs Currently, the rcu_nocbs= kernel boot parameter requires that a specific list of CPUs be specified, and has no way to say "all of them". As noted by user RavFX in a comment to Phoronix topic 1002538, this is an inconvenient side effect of the removal of the RCU_NOCB_CPU_ALL Kconfig option. This commit therefore enables the rcu_nocbs= kernel boot parameter to be given the string "all", as in "rcu_nocbs=all" to specify that all CPUs on the system are to have their RCU callbacks offloaded. Another approach would be to make cpulist_parse() check for "all", but there are uses of cpulist_parse() that do other checking, which could conflict with an "all". This commit therefore focuses on the specific use of cpulist_parse() in rcu_nocb_setup(). Just a note to other people who would like changes to Linux-kernel RCU: If you send your requests to me directly, they might get fixed somewhat faster. RavFX's comment was posted on January 22, 2018 and I first saw it on March 5, 2019. And the only reason that I found it -at- -all- was that I was looking for projects using RCU, and my search engine showed me that Phoronix comment quite by accident. Your choice, though! ;-) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d408661d5fb7..ed4a6dabf31d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1776,7 +1776,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - cpulist_parse(str, rcu_nocb_mask); + if (!strcasecmp(str, "all")) + cpumask_setall(rcu_nocb_mask); + else + cpulist_parse(str, rcu_nocb_mask); return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); -- cgit v1.2.3 From 497e42600b69aca1b799c840d2cfc7ad60bb8017 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2019 14:47:56 -0800 Subject: rcu: Report error for bad rcu_nocbs= parameter values This commit prints a console message when cpulist_parse() reports a bad list of CPUs, and sets all CPUs' bits in that case. The reason for setting all CPUs' bits is that this is the safe(r) choice for real-time workloads, which would normally be the ones using the rcu_nocbs= kernel boot parameter. Either way, later RCU console log messages list the actual set of CPUs whose RCU callbacks will be offloaded. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed4a6dabf31d..f0aeb7416dcc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1772,14 +1772,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges. If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); if (!strcasecmp(str, "all")) cpumask_setall(rcu_nocb_mask); else - cpulist_parse(str, rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); -- cgit v1.2.3 From 0f58d2ac2c87d27006c2b610668ebc4ff1f7c3ba Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 8 Mar 2019 15:16:18 +0530 Subject: rcu: Fix self-wakeups for grace-period kthread The current rcu_gp_kthread_wake() function uses in_interrupt() and thus does a self-wakeup from all interrupt contexts, including the pointless case where the GP kthread happens to be running with bottom halves disabled, along with the impossible case where the GP kthread is running within an NMI handler (you are not supposed to invoke rcu_gp_kthread_wake() from within an NMI handler. This commit therefore replaces the in_interrupt() with in_irq(), so that the self-wakeups happen only from handlers for hardware interrupts and softirqs. This also makes the code match the comment. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5aefd36ac648..139fa1f5c537 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1585,7 +1585,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) static void rcu_gp_kthread_wake(void) { if ((current == rcu_state.gp_kthread && - !in_interrupt() && !in_serving_softirq()) || + !in_irq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; -- cgit v1.2.3 From 6973032a602ee678c98644a30d57ebf9c72dd6d3 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 11 Mar 2019 15:16:11 +0530 Subject: rcu: Default jiffies_to_sched_qs to jiffies_till_sched_qs The current code only calls adjust_jiffies_till_sched_qs() if jiffies_till_sched_qs is left at its default value, so when the jiffies_till_sched_qs kernel-boot parameter actually is specified, jiffies_to_sched_qs will be left with the value zero, which will result in useless slowdowns of cond_resched(). This commit therefore changes rcu_init_geometry() to unconditionally invoke adjust_jiffies_till_sched_qs(), which ensures that jiffies_to_sched_qs will be initialized in all cases, thus maintaining good cond_resched() performance. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. 
--- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 139fa1f5c537..466299c3d2da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3739,8 +3739,7 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; - if (jiffies_till_sched_qs == ULONG_MAX) - adjust_jiffies_till_sched_qs(); + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && -- cgit v1.2.3 From 85f2b60c4321b088ba08ec9a05b8a7b68e4ada2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Mar 2019 15:45:13 -0700 Subject: rcu: Update jiffies_to_sched_qs and adjust_jiffies_till_sched_qs() comments This commit better documents the jiffies_to_sched_qs default-value strategy used by adjust_jiffies_till_sched_qs(). Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 466299c3d2da..e117732bcd5d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -406,7 +406,7 @@ static bool rcu_kick_kthreads; */ static ulong jiffies_till_sched_qs = ULONG_MAX; module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ /* @@ -424,6 +424,7 @@ static void adjust_jiffies_till_sched_qs(void) WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); return; } + /* Otherwise, set to third fqs scan, but bound below on large system. */ j = READ_ONCE(jiffies_till_first_fqs) + 2 * READ_ONCE(jiffies_till_next_fqs); if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) -- cgit v1.2.3 From 5d8a752e31aaa4c9703f201956a40b45ed791217 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 20 Mar 2019 03:33:00 +0000 Subject: rcu: Fix force_qs_rnp() header comment Previously, threads blocked on offlining CPUs were migrated to the root rcu_node structure, thus requiring RCU priority boosting on this structure. However, since commit d19fb8d1f3f6 ("rcu: Don't migrate blocked tasks even if all corresponding CPUs offline"), RCU does not migrate blocked tasks. Consequently, RCU no longer does RCU priority boosting on the root rcu_node structure as of commit 1be0085b515e ("rcu: Don't initiate RCU priority boosting on root rcu_node"). This commit therefore brings the force_qs_rnp() function's header comment in line with this new no-root-boosting reality. Signed-off-by: Zhouyi Zhou [ paulmck: Also remove obsolete comment on suppressing new grace periods. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e117732bcd5d..abc8512ceb5f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2548,11 +2548,11 @@ void rcu_sched_clock_irq(int user) } /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods.
+ * Scan the leaf rcu_node structures. For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state. */ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { -- cgit v1.2.3 From a2badefa8574a7a424a72f67a3bd43248141f76a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 16:29:50 -0700 Subject: rcu: Eliminate redundant NULL-pointer check Because rcu_wake_cond() checks for a null task_struct pointer, there is no need for its callers to do so. This commit eliminates the redundant check. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f0aeb7416dcc..81d3cd821891 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1191,8 +1191,6 @@ static int rcu_boost_kthread(void *arg) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - struct task_struct *t; - raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1206,9 +1204,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - t = rnp->boost_kthread_task; - if (t) - rcu_wake_cond(t, rnp->boost_kthread_status); + rcu_wake_cond(rnp->boost_kthread_task, + rnp->boost_kthread_status); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit v1.2.3 From f1a98045abd824df833354b309b5fa64ff95f792 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 23 Mar 2019 09:19:12 -0700 Subject: rcu: Fix typo in tree_exp.h comment This commit changes a rcu_exp_handler() comment from rcu_preempt_defer_qs() to rcu_preempt_deferred_qs() in order to better match reality. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..ec4fb93a5dbe 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -648,7 +648,7 @@ static void rcu_exp_handler(void *unused) * * If the CPU is fully enabled (or if some buggy RCU-preempt * read-side critical section is being used from idle), just - * invoke rcu_preempt_defer_qs() to immediately report the + * invoke rcu_preempt_deferred_qs() to immediately report the * quiescent state. We cannot use rcu_read_unlock_special() * because we are in an interrupt handler, which will cause that * function to take an early exit without doing anything. -- cgit v1.2.3 From add0d37b4f1e77de7d170ece43c8d765572a1eab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Mar 2019 10:22:22 -0700 Subject: rcu: Correct READ_ONCE()/WRITE_ONCE() for ->rcu_read_unlock_special The task_struct structure's ->rcu_read_unlock_special field is only ever read or written by the owning task, but it is accessed both at process and interrupt levels. It may therefore be accessed using plain reads and writes while interrupts are disabled, but must be accessed using READ_ONCE() and WRITE_ONCE() or better otherwise. 
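[ Illustration, not part of the original posting: the access discipline in miniature. The two macros are simplified user-space stand-ins for the kernel's READ_ONCE() and WRITE_ONCE(), and the task structure is invented for the example. ]

#include <stdbool.h>
#include <stdio.h>

/* Simplified user-space stand-ins for the kernel's once-access macros. */
#define WRITE_ONCE(x, val) (*(volatile typeof(x) *)&(x) = (val))
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))

/* Invented miniature of the task_struct field in question. */
struct task_like {
	bool need_qs;	/* stands in for ->rcu_read_unlock_special.b.need_qs */
};

static struct task_like task;

/* With "interrupts disabled" nothing can intervene, so a plain store is fine. */
static void clear_with_irqs_disabled(void)
{
	task.need_qs = false;
}

/* Otherwise an interrupt handler may also touch the field, so mark the store. */
static void clear_with_irqs_enabled(void)
{
	WRITE_ONCE(task.need_qs, false);
}

int main(void)
{
	task.need_qs = true;
	clear_with_irqs_enabled();
	printf("need_qs = %d\n", (int)READ_ONCE(task.need_qs));
	clear_with_irqs_disabled();
	return 0;
}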
This commit makes a few adjustments to align with this discipline. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index ec4fb93a5dbe..1ee0782213b8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -633,7 +633,7 @@ static void rcu_exp_handler(void *unused) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->deferred_qs = true; - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81d3cd821891..6ddb3c05e88f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -285,7 +285,7 @@ static void rcu_qs(void) TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ - current->rcu_read_unlock_special.b.need_qs = false; + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); } } @@ -817,7 +817,7 @@ void exit_rcu(void) if (unlikely(!list_empty(¤t->rcu_node_entry))) { t->rcu_read_lock_nesting = 1; barrier(); - t->rcu_read_unlock_special.b.blocked = true; + WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); } else if (unlikely(t->rcu_read_lock_nesting)) { t->rcu_read_lock_nesting = 1; } else { -- cgit v1.2.3 From 5cdfd174ea6c2dc1d331b61bdc9572698658600a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Feb 2019 10:44:33 -0800 Subject: srcu: Check for in-flight callbacks in _cleanup_srcu_struct() If someone fails to drain the corresponding SRCU callbacks (for example, by failing to invoke srcu_barrier()) before invoking either cleanup_srcu_struct() or cleanup_srcu_struct_quiesced(), the resulting diagnostic is an ambiguous use-after-free diagnostic, and even then only if you are running something like KASAN. This commit therefore improves SRCU diagnostics by adding checks for in-flight callbacks at _cleanup_srcu_struct() time. Note that these diagnostics can still be defeated, for example, by invoking call_srcu() concurrently with cleanup_srcu_struct(). Which is a really bad idea, but sometimes all too easy to do. But even then, these diagnostics have at least some probability of catching the problem. Reported-by: Sagi Grimberg Reported-by: Bart Van Assche Signed-off-by: Paul E. McKenney Tested-by: Bart Van Assche --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a60b8ba9e1ac..4f30f3ecabc1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -387,6 +387,8 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) del_timer_sync(&sdp->delay_work); flush_work(&sdp->work); } + if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) + return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(ssp))) { -- cgit v1.2.3 From f5ad3991493c69d203d42b94d32349b54c58a3f1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 13 Feb 2019 13:54:37 -0800 Subject: srcu: Remove cleanup_srcu_struct_quiesced() The cleanup_srcu_struct_quiesced() function was added because NVME used WQ_MEM_RECLAIM workqueues and SRCU did not, which meant that NVME workqueues waiting on SRCU workqueues could result in deadlocks during low-memory conditions. However, SRCU now also has WQ_MEM_RECLAIM workqueues, so there is no longer a potential for deadlock. Furthermore, it turns out to be extremely hard to use cleanup_srcu_struct_quiesced() correctly due to the fact that SRCU callback invocation accesses the srcu_struct structure's per-CPU data area just after callbacks are invoked. Therefore, the usual practice of using srcu_barrier() to wait for callbacks to be invoked before invoking cleanup_srcu_struct_quiesced() fails because SRCU's callback-invocation workqueue handler might be delayed, which can result in cleanup_srcu_struct_quiesced() being invoked (and thus freeing the per-CPU data) before the SRCU's callback-invocation workqueue handler is finished using that per-CPU data. Nor is this a theoretical problem: KASAN emitted use-after-free warnings because of this problem on actual runs. In short, NVME can now safely invoke cleanup_srcu_struct(), which avoids the use-after-free scenario. And cleanup_srcu_struct_quiesced() is quite difficult to use safely. This commit therefore removes cleanup_srcu_struct_quiesced(), switching its sole user back to cleanup_srcu_struct(). This effectively reverts the following pair of commits: f7194ac32ca2 ("srcu: Add cleanup_srcu_struct_quiesced()") 4317228ad9b8 ("nvme: Avoid flush dependency in delete controller flow") Reported-by: Bart Van Assche Signed-off-by: Paul E. McKenney Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche --- kernel/rcu/rcutorture.c | 7 +------ kernel/rcu/srcutiny.c | 9 +++------ kernel/rcu/srcutree.c | 30 ++++++++++++------------------ 3 files changed, 16 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..d2b226110835 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -592,12 +592,7 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - static DEFINE_TORTURE_RANDOM(rand); - - if (torture_random(&rand) & 0x800) - cleanup_srcu_struct(&srcu_ctld); - else - cleanup_srcu_struct_quiesced(&srcu_ctld); + cleanup_srcu_struct(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5d4a39a6505a..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. 
*/ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); - if (quiesced) - WARN_ON(work_pending(&ssp->srcu_work)); - else - flush_work(&ssp->srcu_work); + flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 4f30f3ecabc1..9b761e546de8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) return SRCU_INTERVAL; } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; @@ -369,24 +375,12 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - if (quiesced) { - if (WARN_ON(delayed_work_pending(&ssp->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&ssp->work); - } + flush_delayed_work(&ssp->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - if (quiesced) { - if (WARN_ON(timer_pending(&sdp->delay_work))) - return; /* Just leak it! */ - if (WARN_ON(work_pending(&sdp->work))) - return; /* Just leak it! */ - } else { - del_timer_sync(&sdp->delay_work); - flush_work(&sdp->work); - } + del_timer_sync(&sdp->delay_work); + flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } @@ -399,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) free_percpu(ssp->sda); ssp->sda = NULL; } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the -- cgit v1.2.3 From 10462d6f58fb6dbde7563e9343505d98d5bfba3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 16:10:57 -0800 Subject: rcu: Move RCU CPU stall-warning code out of update.c The RCU CPU stall-warning code for normal grace periods is currently scattered across three files, due to earlier Tiny RCU support for RCU CPU stall warnings and for old Kconfig options that have long since been retired. Given that it is hard for the lead RCU maintainer to find relevant stall-warning code, it would be good to consolidate it. This commit starts this process by moving stall-warning code from kernel/rcu/update.c to a new kernel/rcu/tree_stall.h file. Note that the definitions of rcu_cpu_stall_suppress and rcu_cpu_stall_timeout must remain in kernel/rcu/update.c to provide compatibility for kernel boot parameter lists. Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcu.h | 1 + kernel/rcu/tree.c | 1 + kernel/rcu/tree_stall.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 59 +-------------------------------------------- 4 files changed, 66 insertions(+), 58 deletions(-) create mode 100644 kernel/rcu/tree_stall.h (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..4b58c907b4b7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); #define rcu_ftrace_dump_stall_suppress() \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..424d50ccf9e6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3858,5 +3858,6 @@ void __init rcu_init(void) srcu_init(); } +#include "tree_stall.h" #include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..682189f4d083 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney + */ + + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + WRITE_ONCE(rcu_cpu_stall_timeout, 3); + till_stall_check = 3; + } else if (till_stall_check > 300) { + WRITE_ONCE(rcu_cpu_stall_timeout, 300); + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index cbaa976c5945..c3bf44ba42e5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif #ifdef CONFIG_RCU_STALL_COMMON - -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif - int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. 
*/ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ - int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - WRITE_ONCE(rcu_cpu_stall_timeout, 3); - till_stall_check = 3; - } else if (till_stall_check > 300) { - WRITE_ONCE(rcu_cpu_stall_timeout, 300); - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ - if (!rcu_cpu_stall_suppress) - rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ - if (rcu_cpu_stall_suppress == 2) - rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); - return 0; -} -early_initcall(check_cpu_stall_init); - #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_TASKS_RCU -- cgit v1.2.3 From 3fc3d1709fc75995ee09ad4f35f160cf360b397b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 16:34:47 -0800 Subject: rcu: Move RCU CPU stall-warning code out of tree_plugin.h The RCU CPU stall-warning code for normal grace periods is currently scattered across two files, due to earlier Tiny RCU support for RCU CPU stall warnings and for old Kconfig options that have long since been retired. Given that it is hard for the lead RCU maintainer to find relevant stall-warning code, it would be good to consolidate it. This commit continues this process by moving stall-warning code from kernel/rcu/tree_plugin.h to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 90 --------------------------------------------- kernel/rcu/tree_stall.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..7fa3bc4d481b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,79 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_deferred_qs_irqrestore(t, flags); } -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. - */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - /* - * We could be printing a lot while holding a spinlock. - * Avoid triggering hard lockup.
- */ - touch_nmi_watchdog(); - sched_show_task(t); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return 0; - rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - rcu_print_task_stall_end(); - return ndetected; -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -979,23 +906,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) } static void rcu_preempt_deferred_qs(struct task_struct *t) { } -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - return 0; -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 682189f4d083..6f5f94944f49 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -61,3 +61,98 @@ static int __init check_cpu_stall_init(void) return 0; } early_initcall(check_cpu_stall_init); + +#ifdef CONFIG_PREEMPT + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period. 
+ */ +static void rcu_print_detail_task_stall(void) +{ + struct rcu_node *rnp = rcu_get_root(); + + rcu_print_detail_task_stall_rnp(rnp); + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); +} + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ + pr_cont("\n"); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return 0; + rcu_print_task_stall_begin(rnp); + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + rcu_print_task_stall_end(); + return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall(void) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT */ -- cgit v1.2.3 From 32255d51b6ed00de2b88970ceea8db0ec3bae6f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 16:57:41 -0800 Subject: rcu: Move RCU CPU stall-warning code out of tree.c This commit completes the process of consolidating the code for RCU CPU stall warnings for normal grace periods by moving the remaining such code from kernel/rcu/tree.c to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 291 ----------------------------------------------- kernel/rcu/tree.h | 10 +- kernel/rcu/tree_stall.h | 292 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 299 insertions(+), 294 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 424d50ccf9e6..001dd05f6e38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,8 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; /* Commandeer a sysrq key to dump RCU's tree. */ static bool sysrq_rcu; module_param(sysrq_rcu, bool, 0444); @@ -1167,295 +1165,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. 
- */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); - } - } -} - -/* - * Dump stacks of all tasks running on stalled CPUs. First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps. The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ - int cpu; - unsigned long flags; - unsigned long gpa; - unsigned long j; - int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(cpu); - ndetected++; - } - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - if (ndetected) { - rcu_dump_cpu_stacks(); - - /* Complain about tasks blocking the grace period. 
*/ - rcu_print_detail_task_stall(); - } else { - if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { - pr_err("INFO: Stall ended before state dump start\n"); - } else { - j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), - rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); - } - } - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - - rcu_check_gp_kthread_starvation(); - - panic_on_rcu_stall(); - - rcu_force_quiescent_state(); /* Kick them all. */ -} - -static void print_cpu_stall(void) -{ - int cpu; - unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); - raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(smp_processor_id()); - raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - - rcu_check_gp_kthread_starvation(); - - rcu_dump_cpu_stacks(); - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - panic_on_rcu_stall(); - - /* - * Attempt to revive the RCU machinery by forcing a context switch. - * - * A context switch would normally allow the RCU state machine to make - * progress and it could be we're stuck in kernel space without context - * switches for an entirely unreasonable amount of time. - */ - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ - unsigned long gs1; - unsigned long gs2; - unsigned long gps; - unsigned long j; - unsigned long jn; - unsigned long js; - struct rcu_node *rnp; - - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress()) - return; - rcu_stall_kick_kthreads(); - j = jiffies; - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rcu_state.gp_seq, then - * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally - * another copy of rcu_state.gp_seq. These values are updated in - * the opposite order with memory barriers (or equivalent) during - * grace-period initialization and cleanup. Now, a false positive - * can occur if we get an new value of rcu_state.gp_start and a old - * value of rcu_state.jiffies_stall. But given the memory barriers, - * the only way that this can happen is if one grace period ends - * and another starts between these two fetches. 
This is detected - * by comparing the second fetch of rcu_state.gp_seq with the - * previous fetch from rcu_state.gp_seq. - * - * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, - * and rcu_state.gp_start suffice to forestall false positives. - */ - gs1 = READ_ONCE(rcu_state.gp_seq); - smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rcu_state.jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rcu_state.gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rcu_state.gp_seq); - if (gs1 != gs2 || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. */ - rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); - } -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bb4f995f2d3f..3c4e26fff806 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; int rcu_dynticks_snap(struct rcu_data *rdp); -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); @@ -445,3 +443,9 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); + +/* Forward declarations for tree_stall.h */ +static void rcu_print_detail_task_stall(void); +static int rcu_print_task_stall(struct rcu_node *rnp); +static void record_gp_stall_check_time(void); +static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6f5f94944f49..e0e73f493363 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -8,6 +8,9 @@ */ +/* panic() on RCU Stall sysctl. 
*/ +int sysctl_panic_on_rcu_stall __read_mostly; + #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) #else @@ -156,3 +159,292 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return 0; } #endif /* #else #ifdef CONFIG_PREEMPT */ + +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + rcu_state.gp_start = j; + j1 = rcu_jiffies_till_stall_check(); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + READ_ONCE(rcu_state.gp_flags), + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + wake_up_process(gpk); + } + } +} + +/* + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(void) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +static void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +static void print_other_cpu_stall(unsigned long gp_seq) +{ + int cpu; + unsigned long flags; + unsigned long gpa; + unsigned long j; + int ndetected = 0; + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. 
+ */ + pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); + print_cpu_stall_info_begin(); + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(cpu); + ndetected++; + } + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + print_cpu_stall_info_end(); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + if (ndetected) { + rcu_dump_cpu_stacks(); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(); + } else { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = READ_ONCE(rcu_state.gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", + rcu_state.name, j - gpa, j, gpa, + READ_ONCE(jiffies_till_next_fqs), + rcu_get_root()->qsmask); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + + rcu_check_gp_kthread_starvation(); + + panic_on_rcu_stall(); + + rcu_force_quiescent_state(); /* Kick them all. */ +} + +static void print_cpu_stall(void) +{ + int cpu; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); + print_cpu_stall_info_begin(); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); + print_cpu_stall_info(smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); + print_cpu_stall_info_end(); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", + jiffies - rcu_state.gp_start, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + + rcu_check_gp_kthread_starvation(); + + rcu_dump_cpu_stacks(); + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + panic_on_rcu_stall(); + + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. 
+ */ + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + +static void check_cpu_stall(struct rcu_data *rdp) +{ + unsigned long gs1; + unsigned long gs2; + unsigned long gps; + unsigned long j; + unsigned long jn; + unsigned long js; + struct rcu_node *rnp; + + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress()) + return; + rcu_stall_kick_kthreads(); + j = jiffies; + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rcu_state.gp_seq, then + * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally + * another copy of rcu_state.gp_seq. These values are updated in + * the opposite order with memory barriers (or equivalent) during + * grace-period initialization and cleanup. Now, a false positive + * can occur if we get an new value of rcu_state.gp_start and a old + * value of rcu_state.jiffies_stall. But given the memory barriers, + * the only way that this can happen is if one grace period ends + * and another starts between these two fetches. This is detected + * by comparing the second fetch of rcu_state.gp_seq with the + * previous fetch from rcu_state.gp_seq. + * + * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, + * and rcu_state.gp_start suffice to forestall false positives. + */ + gs1 = READ_ONCE(rcu_state.gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ + js = READ_ONCE(rcu_state.jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = READ_ONCE(rcu_state.gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rcu_state.gp_seq); + if (gs1 != gs2 || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ + rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + if (rcu_gp_in_progress() && + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(); + + } else if (rcu_gp_in_progress() && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2); + } +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} -- cgit v1.2.3 From 21d0d79ab051bf9facb9960a30e58b93a31c75a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 20:36:45 -0800 Subject: rcu: Inline RCU task stall-warning helper functions The rcu_print_detail_task_stall(), rcu_print_task_stall_begin(), and rcu_print_task_stall_end() functions were defined to allow long-gone Kconfig options to provide an abbreviated RCU CPU stall warning printout. This commit saves a few lines of code by inlining them into their sole callers. While in the area, a useless call of rcu_print_detail_task_stall_rnp() on the root rcu_node structure was eliminated. If there is only one rcu_node structure, its tasks get printed twice, but if there are more, the root rcu_node structure is guaranteed to have an empty list of blocked tasks, hence the uselessness. 
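[ Illustration, not part of the original posting: a toy model of the double print. The one-node "tree", the iteration macro, and the print helper below are all invented; with a single rcu_node structure the root is also the sole leaf, so the old root-then-leaves sequence visits it twice. ]

#include <stdio.h>

struct rcu_node_like {
	int id;
};

/* A degenerate one-node "tree": the root is also the only leaf. */
static struct rcu_node_like only_node = { .id = 0 };

static struct rcu_node_like *rcu_get_root_sketch(void)
{
	return &only_node;
}

/* Invented stand-in for rcu_for_each_leaf_node(): visits the single leaf. */
#define for_each_leaf_node_sketch(rnp) \
	for ((rnp) = &only_node; (rnp) != NULL; (rnp) = NULL)

static void print_stall_rnp_sketch(struct rcu_node_like *rnp)
{
	printf("dumping tasks blocked on rcu_node %d\n", rnp->id);
}

int main(void)
{
	struct rcu_node_like *rnp;

	/* Old scheme: explicit root call plus leaf iteration, so the sole
	 * rcu_node structure gets dumped twice. */
	print_stall_rnp_sketch(rcu_get_root_sketch());
	for_each_leaf_node_sketch(rnp)
		print_stall_rnp_sketch(rnp);
	return 0;
}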
(Long ago, root rcu_node structures with non-empty ->blkd_tasks lists could happen, but no longer.) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_stall.h | 36 +++++++----------------------------- 2 files changed, 7 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3c4e26fff806..c6df9a13dd06 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -445,7 +445,6 @@ static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); /* Forward declarations for tree_stall.h */ -static void rcu_print_detail_task_stall(void); static int rcu_print_task_stall(struct rcu_node *rnp); static void record_gp_stall_check_time(void); static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e0e73f493363..b476786b8ef7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -94,30 +94,6 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -129,14 +105,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; - rcu_print_task_stall_begin(rnp); + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } - rcu_print_task_stall_end(); + pr_cont("\n"); return ndetected; } @@ -146,7 +123,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static void rcu_print_detail_task_stall(void) +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) { } @@ -253,7 +230,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) unsigned long gpa; unsigned long j; int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); + struct rcu_node *rnp; long totqlen = 0; /* Kick and suppress, if so configured. */ @@ -291,7 +268,8 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_dump_cpu_stacks(); /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(); + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); } else { if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); -- cgit v1.2.3 From d87cda5094585b7a0f62075de68266cb9c1b35ca Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 11 Jan 2019 20:51:49 -0800 Subject: rcu: Move rcu_print_task_exp_stall() to tree_exp.h Because expedited CPU stall warnings are contained within the kernel/rcu/tree_exp.h file, rcu_print_task_exp_stall() should live there too. This commit carries out the required code motion. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 32 ++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 31 ------------------------------- 2 files changed, 32 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..7be3e085ddd6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -10,6 +10,7 @@ #include static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); /* * Record the start of an expedited grace period. @@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu) { } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu) WARN_ON_ONCE(ret); } +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fa3bc4d481b..72519c57f656 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,27 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_deferred_qs_irqrestore(t, flags); } -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rnp->exp_tasks) - return 0; - t = list_entry(rnp->exp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - return ndetected; -} - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -906,16 +885,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) } static void rcu_preempt_deferred_qs(struct task_struct *t) { } -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections that are - * blocking the current expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - return 0; -} - /* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. 
So check only for -- cgit v1.2.3 From 40e69ac7d0c5a19ea14656bc3131c55719baec96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 20:58:58 -0800 Subject: rcu: Inline RCU stall-warning info helper functions The print_cpu_stall_info_begin() and print_cpu_stall_info_end() print a single character each onto the console, and are a holdover from a time when RCU CPU stall warning messages could be abbreviated using a long-gone Kconfig option. This commit therefore adds these single characters to already-printed strings in the calling functions, and then eliminates both print_cpu_stall_info_begin() and print_cpu_stall_info_end(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 -- kernel/rcu/tree_plugin.h | 12 ------------ kernel/rcu/tree_stall.h | 12 ++++-------- 3 files changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c6df9a13dd06..d73472af49e7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -416,9 +416,7 @@ static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 72519c57f656..2df5bb04fd7a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1550,12 +1550,6 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ - pr_cont("\n"); -} - /* * Print out diagnostic information for the specified stalled CPU. * @@ -1606,12 +1600,6 @@ static void print_cpu_stall_info(int cpu) fast_no_hz); } -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ - pr_err("\t"); -} - /* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ static void zero_cpu_stall_ticks(struct rcu_data *rdp) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b476786b8ef7..7ef3b596e45f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -243,8 +243,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. 
*/ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); + pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); @@ -258,10 +257,9 @@ static void print_other_cpu_stall(unsigned long gp_seq) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rcu_state.gp_start), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { @@ -314,15 +312,13 @@ static void print_cpu_stall(void) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); + pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(smp_processor_id()); raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", jiffies - rcu_state.gp_start, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); -- cgit v1.2.3 From 59b73a27681c5841440391f970a9a085228ba975 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 21:05:17 -0800 Subject: rcu: Move FAST_NO_HZ stall-warning code to tree_stall.h This commit further consolidates the stall-warning code by moving print_cpu_stall_info() and its helper functions along with zero_cpu_stall_ticks() to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 80 ------------------------------------------------ kernel/rcu/tree_stall.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d73472af49e7..49bf3b00bb50 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -416,7 +416,6 @@ static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info(int cpu); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2df5bb04fd7a..a1f9d7c15bd8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1528,86 +1528,6 @@ static void rcu_cleanup_after_idle(void) #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!rdp->tick_nohz_enabled_snap]); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware. Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ - unsigned long delta; - char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - char *ticks_title; - unsigned long ticks_value; - - /* - * We could be printing a lot while holding a spinlock. Avoid - * triggering hard lockup. - */ - touch_nmi_watchdog(); - - ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); - if (ticks_value) { - ticks_title = "GPs behind"; - } else { - ticks_title = "ticks this GP"; - ticks_value = rdp->ticks_this_gp; - } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", - cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], - "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], - !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : - rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : - "!."[!delta], - ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, - rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. 
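The stall-info lines assembled in this diff lean on a terse C idiom: indexing a two-character string literal with a boolean, as in ".l"[rdp->all_lazy]. A tiny illustration in plain C, with invented values:

#include <stdio.h>

int main(void)
{
	int all_lazy = 1;
	int nonlazy_pending = 0;

	/*
	 * "xY"[cond] indexes a two-char string literal: '.' for 0,
	 * the flag letter for 1; !! folds any nonzero value to 1.
	 */
	printf("%c%c\n", ".l"[!!all_lazy], ".L"[!!nonlazy_pending]); /* "l." */
	return 0;
}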
*/ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} - #ifdef CONFIG_RCU_NOCB_CPU /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 7ef3b596e45f..19b915380a6f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -223,6 +223,86 @@ static void panic_on_rcu_stall(void) panic("RCU Stall\n"); } +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + + sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + rdp->last_accelerate & 0xffff, jiffies & 0xffff, + ".l"[rdp->all_lazy], + ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], + ".D"[!rdp->tick_nohz_enabled_snap]); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + *cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(int cpu) +{ + unsigned long delta; + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + char *ticks_title; + unsigned long ticks_value; + + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], + ticks_value, ticks_title, + rcu_dynticks_snap(rdp) & 0xfff, + rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + static void print_other_cpu_stall(unsigned long gp_seq) { int cpu; -- cgit v1.2.3 From e23344c2ca42d0083596bb39964675bef00ad691 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 Jan 2019 09:35:44 -0800 Subject: rcu: Organize functions in tree_stall.h This commit does only code movement, removal of now-unneeded forward declarations, and addition of comments. It organizes the functions that implement RCU CPU stall warnings for normal grace periods into three categories: 1. 
Control of RCU CPU stall warnings, including computing timeouts. 2. Interaction of stall warnings with grace periods. 3. Actual printing of the RCU CPU stall-warning messages. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_stall.h | 180 ++++++++++++++++++++++++++---------------------- 2 files changed, 97 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 49bf3b00bb50..099410dbcbe9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -442,6 +442,5 @@ static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); /* Forward declarations for tree_stall.h */ -static int rcu_print_task_stall(struct rcu_node *rnp); static void record_gp_stall_check_time(void); static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 19b915380a6f..03ed47883d8a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,9 @@ * Author: Paul E. McKenney */ +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; @@ -17,6 +20,7 @@ int sysctl_panic_on_rcu_stall __read_mostly; #define RCU_STALL_DELAY_DELTA 0 #endif +/* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); @@ -36,6 +40,7 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { if (!rcu_cpu_stall_suppress) @@ -48,6 +53,7 @@ void rcu_sysrq_end(void) rcu_cpu_stall_suppress = 0; } +/* Don't print RCU CPU stall warnings during a kernel panic. */ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { rcu_cpu_stall_suppress = 1; @@ -65,6 +71,78 @@ static int __init check_cpu_stall_init(void) } early_initcall(check_cpu_stall_init); +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). */ +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + rcu_state.gp_start = j; + j1 = rcu_jiffies_till_stall_check(); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. 
*/ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + #ifdef CONFIG_PREEMPT /* @@ -137,43 +215,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } #endif /* #else #ifdef CONFIG_PREEMPT */ -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); - } - } -} - /* * Dump stacks of all tasks running on stalled CPUs. First try using * NMIs, but fall back to manual remote stack tracing on architectures @@ -196,33 +237,6 @@ static void rcu_dump_cpu_stacks(void) } } -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -295,12 +309,26 @@ static void print_cpu_stall_info(int cpu) fast_no_hz); } -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) +/* Complain about starvation of grace-period kthread. 
*/ +static void rcu_check_gp_kthread_starvation(void) { - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + READ_ONCE(rcu_state.gp_flags), + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + wake_up_process(gpk); + } + } } static void print_other_cpu_stall(unsigned long gp_seq) @@ -488,17 +516,3 @@ static void check_cpu_stall(struct rcu_data *rdp) print_other_cpu_stall(gs2); } } - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} -- cgit v1.2.3 From 7ac1907c9e7ba5f80d3c298fe9d2dbf620566a49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Jan 2019 10:19:20 -0800 Subject: rcu: Move irq-disabled stall-warning checking to tree_stall.h The rcu_iw_handler() function's sole purpose in life is to indicate whether a stalled CPU had interrupts disabled, so it belongs in kernel/rcu/tree_stall.h. This commit therefore makes that move, clarifying its header comment while in the area. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 --------------------- kernel/rcu/tree.h | 1 + kernel/rcu/tree_stall.h | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 001dd05f6e38..929531ed168c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1031,27 +1031,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) return 0; } -/* - * Handler for the irq_work request posted when a grace period has - * gone on for too long, but not yet long enough for an RCU CPU - * stall warning. Set state appropriately, but just complain if - * there is unexpected state on entry. 
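For context on the handler being relocated: rcu_iw follows the stock irq_work pattern, and the fact that an irq_work handler can only run once the target CPU services the request is exactly what makes it usable as an irqs-disabled detector. A rough sketch of that pattern with invented names; the real code posts to a remote CPU via irq_work_queue_on():

#include <linux/irq_work.h>
#include <linux/kernel.h>

struct stall_probe {
	struct irq_work iw;
	bool pending;
};

/* Runs in hard-irq context once the target CPU services the request. */
static void stall_probe_handler(struct irq_work *iwp)
{
	struct stall_probe *sp = container_of(iwp, struct stall_probe, iw);

	sp->pending = false;
}

static void stall_probe_post(struct stall_probe *sp)
{
	init_irq_work(&sp->iw, stall_probe_handler);
	sp->pending = true;
	irq_work_queue(&sp->iw);
	/*
	 * If sp->pending is still set much later, no interrupt has been
	 * taken since the post, the very condition rcu_iw is probing for.
	 */
}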
- */ -static void rcu_iw_handler(struct irq_work *iwp) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = container_of(iwp, struct rcu_data, rcu_iw); - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); - if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gp_seq = rnp->gp_seq; - rdp->rcu_iw_pending = false; - } - raw_spin_unlock_rcu_node(rnp); -} - /* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 099410dbcbe9..f882ce3ca5a5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -443,4 +443,5 @@ static void rcu_dynticks_task_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 03ed47883d8a..526e223e41ce 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -139,6 +139,26 @@ static void rcu_stall_kick_kthreads(void) } } +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling. Set state + * appropriately, but just complain if there is unexpected state on entry. + */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gp_seq = rnp->gp_seq; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + ////////////////////////////////////////////////////////////////////////////// // // Printing RCU CPU stall warnings -- cgit v1.2.3 From b51bcbbf16ef0ea352e8b924dd8638112e4037a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Jan 2019 07:01:33 -0800 Subject: rcu: Move forward-progress checkers into tree_stall.h This commit further consolidates stall-warning functionality by moving forward-progress checkers into kernel/rcu/tree_stall.h, updating a comment or two while in the area. More specifically, this commit moves show_rcu_gp_kthreads(), rcu_check_gp_start_stall(), rcu_fwd_progress_check(), sysrq_rcu, sysrq_show_rcu(), sysrq_rcudump_op, and rcu_sysrq_init() from kernel/rcu/tree.c to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 166 ---------------------------------------------- kernel/rcu/tree.h | 2 + kernel/rcu/tree_stall.h | 171 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 166 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 929531ed168c..4cb1ebd93b0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,9 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* Commandeer a sysrq key to dump RCU's tree. */ -static bool sysrq_rcu; -module_param(sysrq_rcu, bool, 0444); /* * The rcu_scheduler_active variable is initialized to the value @@ -510,74 +507,6 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } -/* - * Show the state of the grace-period kthreads. 
- */ -void show_rcu_gp_kthreads(void) -{ - int cpu; - unsigned long j; - unsigned long ja; - unsigned long jr; - unsigned long jw; - struct rcu_data *rdp; - struct rcu_node *rnp; - - j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", - rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); - rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)rnp->gp_seq, - (long)rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) - continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)rdp->gp_seq_needed); - } - } - /* sched_show_task(rcu_state.gp_kthread); */ -} -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); - -/* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) -{ - show_rcu_gp_kthreads(); -} - -static struct sysrq_key_op sysrq_rcudump_op = { - .handler = sysrq_show_rcu, - .help_msg = "show-rcu(y)", - .action_msg = "Show RCU tree", - .enable_mask = SYSRQ_ENABLE_DUMP, -}; - -static int __init rcu_sysrq_init(void) -{ - if (sysrq_rcu) - return register_sysrq_key('y', &sysrq_rcudump_op); - return 0; -} -early_initcall(rcu_sysrq_init); - /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -2323,101 +2252,6 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, - const unsigned long gpssdelay) -{ - unsigned long flags; - unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(); - static atomic_t warned = ATOMIC_INIT(0); - - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) - return; - j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) - return; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - /* Hold onto the leaf lock to make others see warned==1. */ - - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. 
*/ - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || - atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - WARN_ON(1); - if (rnp_root != rnp) - raw_spin_unlock_rcu_node(rnp_root); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - show_rcu_gp_kthreads(); -} - -/* - * Do a forward-progress check for rcutorture. This is normally invoked - * due to an OOM event. The argument "j" gives the time period during - * which rcutorture would like progress to have been made. - */ -void rcu_fwd_progress_check(unsigned long j) -{ - unsigned long cbs; - int cpu; - unsigned long max_cbs = 0; - int max_cpu = -1; - struct rcu_data *rdp; - - if (rcu_gp_in_progress()) { - pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); - show_rcu_gp_kthreads(); - } else { - pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); - preempt_disable(); - rdp = this_cpu_ptr(&rcu_data); - rcu_check_gp_start_stall(rdp->mynode, rdp, j); - preempt_enable(); - } - for_each_possible_cpu(cpu) { - cbs = rcu_get_n_cbs_cpu(cpu); - if (!cbs) - continue; - if (max_cpu < 0) - pr_info("%s: callbacks", __func__); - pr_cont(" %d: %lu", cpu, cbs); - if (cbs <= max_cbs) - continue; - max_cbs = cbs; - max_cpu = cpu; - } - if (max_cpu >= 0) - pr_cont("\n"); -} -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); - /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(struct softirq_action *unused) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f882ce3ca5a5..e253d11af3c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -445,3 +445,5 @@ static void rcu_dynticks_task_exit(void); static void record_gp_stall_check_time(void); static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 526e223e41ce..9e3db08d02bc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -536,3 +536,174 @@ static void check_cpu_stall(struct rcu_data *rdp) print_other_cpu_stall(gs2); } } + +////////////////////////////////////////////////////////////////////////////// +// +// RCU forward-progress mechanisms, including of callback invocation. + + +/* + * Show the state of the grace-period kthreads. + */ +void show_rcu_gp_kthreads(void) +{ + int cpu; + unsigned long j; + unsigned long ja; + unsigned long jr; + unsigned long jw; + struct rcu_data *rdp; + struct rcu_node *rnp; + + j = jiffies; + ja = j - READ_ONCE(rcu_state.gp_activity); + jr = j - READ_ONCE(rcu_state.gp_req_activity); + jw = j - READ_ONCE(rcu_state.gp_wake_time); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, + rcu_state.gp_kthread ? 
rcu_state.gp_kthread->state : 0x1ffffL, + ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), + (long)READ_ONCE(rcu_state.gp_seq), + (long)READ_ONCE(rcu_get_root()->gp_seq_needed), + READ_ONCE(rcu_state.gp_flags)); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", + rnp->grplo, rnp->grphi, (long)rnp->gp_seq, + (long)rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rcu_state.gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)rdp->gp_seq_needed); + } + } + /* sched_show_task(rcu_state.gp_kthread); */ +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, rcu_state.gp_req_activity + gpssdelay) || + time_before(j, rcu_state.gp_activity + gpssdelay) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + show_rcu_gp_kthreads(); +} + +/* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. 
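The OOM event mentioned in this comment reaches rcutorture through the kernel's OOM notifier chain. A bare-bones sketch of such a registration, with invented names; the real rcutorture callback is where rcu_fwd_progress_check() gets invoked:

#include <linux/notifier.h>
#include <linux/oom.h>
#include <linux/printk.h>

static int fwd_oom_notify(struct notifier_block *self,
			  unsigned long unused, void *nfreed)
{
	/* rcutorture's real callback invokes rcu_fwd_progress_check() here. */
	pr_info("OOM hit during forward-progress testing\n");
	return NOTIFY_OK;
}

static struct notifier_block fwd_oom_nb = {
	.notifier_call = fwd_oom_notify,
};

/* In init code: register_oom_notifier(&fwd_oom_nb); */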
+ */ +void rcu_fwd_progress_check(unsigned long j) +{ + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); + show_rcu_gp_kthreads(); + } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); + +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ + show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { + .handler = sysrq_show_rcu, + .help_msg = "show-rcu(y)", + .action_msg = "Show RCU tree", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ + if (sysrq_rcu) + return register_sysrq_key('y', &sysrq_rcudump_op); + return 0; +} +early_initcall(rcu_sysrq_init); -- cgit v1.2.3 From 6c70e9cd5f3c6d93f3e1da6d101073e898f39170 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 8 Mar 2019 11:57:48 -0800 Subject: rcu: Fix nohz status in stall warning The Documentation/RCU/stallwarn.txt file says that stall warnings print "D" if dyntick-idle processing is enabled, but the code in print_cpu_stall_fast_no_hz() prints "." instead. This commit therefore reverses the sense of the test to make the code match the documentation. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 9e3db08d02bc..f65a73a97323 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -267,7 +267,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) rdp->last_accelerate & 0xffff, jiffies & 0xffff, ".l"[rdp->all_lazy], ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!rdp->tick_nohz_enabled_snap]); + ".D"[!!rdp->tick_nohz_enabled_snap]); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3 From 24aca4aea4f0179e0e56cf9ec610c27d07702945 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Jan 2019 19:23:00 -0800 Subject: torture: Don't try to offline the last CPU If there is only one online CPU, it doesn't make sense to try to offline it, as any such attempt is guaranteed to fail. This commit therefore checks for this condition and refuses to attempt the nonsensical. Reported-by: Su Yue Signed-off-by: Paul E. McKenney Tested-By: Su Yue --- kernel/torture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 8faa1a9aaeb9..17b2be9bde12 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; + if (num_online_cpus() <= 1) + return false; /* Can't offline the last CPU. 
*/ if (verbose > 1) pr_alert("%s" TORTURE_FLAG -- cgit v1.2.3 From a3b0e1e59ef1757488ef05b66bc376eaf7b06ada Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Feb 2019 15:06:13 -0800 Subject: rcutorture: Make rcutorture_extend_mask() comment match the code The code actually rarely uses more than one type of RCU read-side protection, as is desired given that we need some reasonable probability of preempting RCU read-side critical sections, which cannot happen with multiple types of protection. This commit therefore adjusts the comment. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..2453229ba15a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1160,7 +1160,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long randmask2 = randmask1 >> 3; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - /* Most of the time lots of bits, half the time only one bit. */ + /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else -- cgit v1.2.3 From f47cb1bb0da23162f4c17e0c0023df4889ecb492 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Feb 2019 15:59:43 -0800 Subject: rcutorture: Remove ->ext_irq_conflict field Back when there was a separate RCU-bh flavor, the ->ext_irq_conflict field was used to prevent executing local_bh_enable() while interrupts were disabled. However, there is no longer an RCU-bh flavor, so this commit removes the no-longer-needed ->ext_irq_conflict field. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2453229ba15a..21ab3c7eb221 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,7 +299,6 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; - int ext_irq_conflict; const char *name; }; @@ -1170,10 +1169,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & cur_ops->ext_irq_conflict) && - (oldmask & cur_ops->ext_irq_conflict)) - mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ return mask ?: RCUTORTURE_RDR_RCU; } -- cgit v1.2.3 From d44ac1bebc47e46d62019808a893582f56496a98 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Sat, 9 Mar 2019 00:40:45 +0530 Subject: rcutorture: Fix expected forward progress duration in OOM notifier The rcutorture_oom_notify() function has a misplaced close parenthesis that results in increasingly long delays in rcu_fwd_progress_check()'s checking for various RCU forward-progress problems. This commit therefore puts the parenthesis in the right place. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 21ab3c7eb221..b42682b94cb7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1843,7 +1843,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", -- cgit v1.2.3 From b813afae7ab6a5e91b4e16cc567331d9c2ae1f04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 09:27:28 -0700 Subject: rcutorture: Fix cleanup path for invalid torture_type strings If the specified rcutorture.torture_type is not in the rcu_torture_init() function's torture_ops[] array, rcutorture prints some console messages and then invokes rcu_torture_cleanup() to set state so that a future torture test can run. However, rcu_torture_cleanup() also attempts to end the test that didn't actually start, and in doing so relies on the value of cur_ops, a value that is not particularly relevant in this case. This can result in confusing output or even follow-on failures due to attempts to use facilities that have not been properly initialized. This commit therefore sets the value of cur_ops to NULL in this case and inserts a check near the beginning of rcu_torture_cleanup(), thus avoiding relying on an irrelevant cur_ops value. Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b42682b94cb7..e3c0f57ab0aa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2089,6 +2089,10 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } + if (!cur_ops) { + torture_cleanup_end(); + return; + } rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); @@ -2262,6 +2266,7 @@ rcu_torture_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { -- cgit v1.2.3 From ad092c027713a68a34168942a5ef422e42e039f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 10:26:41 -0700 Subject: rcuperf: Fix cleanup path for invalid perf_type strings If the specified rcuperf.perf_type is not in the rcu_perf_init() function's perf_ops[] array, rcuperf prints some console messages and then invokes rcu_perf_cleanup() to set state so that a future torture test can run. However, rcu_perf_cleanup() also attempts to end the test that didn't actually start, and in doing so relies on the value of cur_ops, a value that is not particularly relevant in this case. This can result in confusing output or even follow-on failures due to attempts to use facilities that have not been properly initialized. This commit therefore sets the value of cur_ops to NULL in this case and inserts a check near the beginning of rcu_perf_cleanup(), thus avoiding relying on an irrelevant cur_ops value. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c29761152874..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -494,6 +494,10 @@ rcu_perf_cleanup(void) if (torture_cleanup_begin()) return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) @@ -614,6 +618,7 @@ rcu_perf_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->init) -- cgit v1.2.3 From a9d6938ddb7f892552013b93e4842fc1a538628d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 13:30:01 -0700 Subject: locktorture: NULL cxt.lwsa and cxt.lrsa to allow bad-arg detection Currently, lock_torture_cleanup() uses the values of cxt.lwsa and cxt.lrsa to detect bad parameters that prevented locktorture from initializing, let alone running. In this case, lock_torture_cleanup() does no cleanup aside from invoking torture_cleanup_begin() and torture_cleanup_end(), as required to permit future torture tests to run. However, this heuristic fails if the run with bad parameters was preceded by a previous run that actually ran: In this case, both cxt.lwsa and cxt.lrsa will remain non-zero, which means that the current lock_torture_cleanup() invocation will be unable to detect the fact that it should skip cleanup, which can result in charming outcomes such as double frees. This commit therefore NULLs out both cxt.lwsa and cxt.lrsa at the end of any run that actually ran. Signed-off-by: Paul E. McKenney Cc: Davidlohr Bueso Cc: Josh Triplett --- kernel/locking/locktorture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ad40a2617063..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -829,7 +829,9 @@ static void lock_torture_cleanup(void) "End of test: SUCCESS"); kfree(cxt.lwsa); + cxt.lwsa = NULL; kfree(cxt.lrsa); + cxt.lrsa = NULL; end: torture_cleanup_end(); -- cgit v1.2.3 From 8db5da0b8618df79eceea99672e205d4a2a6309e Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 27 Jan 2019 19:03:45 -0500 Subject: x86/ima: require signed kernel modules Have the IMA architecture specific policy require signed kernel modules on systems with secure boot mode enabled; and coordinate the different signature verification methods, so only one signature is required. Requiring appended kernel module signatures may be configured, enabled on the boot command line, or with this patch enabled in secure boot mode. This patch defines set_module_sig_enforced(). To coordinate between appended kernel module signatures and IMA signatures, only define an IMA MODULE_CHECK policy rule if CONFIG_MODULE_SIG is not enabled. A custom IMA policy may still define and require an IMA signature. Signed-off-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Jessica Yu --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..985caa467aef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -286,6 +286,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? 
*/ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From 7a8e61f8478639072d402a26789055a4a4de8f77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Mar 2019 11:36:19 +0100 Subject: timekeeping: Force upper bound for setting CLOCK_REALTIME Several people reported testing failures after setting CLOCK_REALTIME close to the limits of the kernel internal representation in nanoseconds, i.e. year 2262. The failures are exposed in subsequent operations, i.e. when arming timers or when the advancing CLOCK_MONOTONIC makes the calculation of CLOCK_REALTIME overflow into negative space. Now people start to paper over the underlying problem by clamping calculations to the valid range, but that's just wrong because such workarounds will prevent detection of real issues as well. It is reasonable to force an upper bound for the various methods of setting CLOCK_REALTIME. Year 2262 is the absolute upper bound. Assume a maximum uptime of 30 years which is plenty enough even for esoteric embedded systems. That results in an upper bound of year 2232 for setting the time. Once that limit is reached in reality this limit is only a small part of the problem space. But until then this stops people from trying to paper over the problem at the wrong places. Reported-by: Xiongfeng Wang Reported-by: Hongbo Yao Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: Miroslav Lichvar Cc: Arnd Bergmann Cc: Richard Cochran Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903231125480.2157@nanos.tec.linutronix.de --- kernel/time/time.c | 2 +- kernel/time/timekeeping.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..86656bbac232 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz static int firsttime = 1; int error = 0; - if (tv && !timespec64_valid(tv)) + if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 540145da33da..5716e28bfa3c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts) unsigned long flags; int ret = 0; - if (!timespec64_valid_strict(ts)) + if (!timespec64_valid_settod(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), *ts); if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_strict(&tmp)) { + !timespec64_valid_settod(&tmp)) { ret = -EINVAL; goto error; } @@ -1527,7 +1527,7 @@ void __init timekeeping_init(void) unsigned long flags; read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); - if (timespec64_valid_strict(&wall_time) && + if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { -- cgit v1.2.3 From 1c7651f43777cdd59c1aaa82c87324d3e7438c7b Mon Sep 17 00:00:00 2001 From: Eugene Loh Date: Mon, 25 Feb 2019 11:59:58 -0800 Subject: kallsyms: store type information in its own array When a module is loaded, its symbols' Elf_Sym information is stored in a symtab. 
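The year-2262 and year-2232 figures in the timekeeping change above follow directly from the 2^63-1 nanosecond limit; a quick standalone arithmetic check in plain C, assuming the kernel's KTIME_MAX definition:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int64_t ktime_max_ns = INT64_MAX;                /* KTIME_MAX */
	const int64_t ns_per_year = 31556952LL * 1000000000LL; /* Gregorian */
	long long hard = 1970 + ktime_max_ns / ns_per_year;
	long long settable = 1970 + (ktime_max_ns - 30 * ns_per_year) / ns_per_year;

	printf("representation limit: ~year %lld\n", hard);     /* 2262 */
	printf("settable upper bound: ~year %lld\n", settable); /* 2232 */
	return 0;
}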
Further, type information is also captured. Since Elf_Sym has no type field, historically the st_info field has been hijacked for storing type: st_info was overwritten. commit 5439c985c5a83a8419f762115afdf560ab72a452 ("module: Overwrite st_size instead of st_info") changes that practice, as its one-liner indicates. Unfortunately, this change overwrites symbol size, information that a tool like DTrace expects to find. Allocate a typetab array to store type information so that no Elf_Sym field needs to be overwritten. Fixes: 5439c985c5a8 ("module: Overwrite st_size instead of st_info") Signed-off-by: Eugene Loh Reviewed-by: Nick Alcock [jeyu: renamed typeoff -> typeoffs ] Signed-off-by: Jessica Yu --- kernel/module-internal.h | 2 +- kernel/module.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..d354341f8cc0 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -20,7 +20,7 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long symoffs, stroffs; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; struct _ddebug *debug; unsigned int num_debug; bool sig_ok; diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..69e52e82242a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2647,6 +2647,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ @@ -2660,6 +2662,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_layout.size; mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); mod->init_layout.size = debug_align(mod->init_layout.size); } @@ -2683,20 +2687,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_size - = elf_type(&mod->kallsyms->symtab[i], info); - - /* Now populate the cut down core kallsyms for after init. */ + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. 
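In other words, the fix keeps a typetab array parallel to the symbol table, so neither st_info nor st_size has to be overloaded. A compact sketch of that layout, with illustrative names and userspace ELF types:

#include <elf.h>

/* typetab[i] holds the type character for symtab[i]; nothing in the
 * Elf_Sym entries themselves is repurposed. */
struct mini_kallsyms {
	Elf64_Sym *symtab;
	char *typetab;
	unsigned int num_symtab;
};

static char mini_sym_type(const struct mini_kallsyms *ks, unsigned int i)
{
	return i < ks->num_symtab ? ks->typetab[i] : '?';
}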
+ */ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -4080,7 +4087,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, const Elf_Sym *sym = &kallsyms->symtab[symnum]; *value = kallsyms_symbol_value(sym); - *type = sym->st_size; + *type = kallsyms->typetab[symnum]; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); -- cgit v1.2.3 From aba0954327c831f593702e3a81ef3ad4bec7a838 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Mar 2019 11:28:52 +0100 Subject: tick/broadcast: Fix warning about undefined tick_broadcast_oneshot_offline() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Randconfig builds with CONFIG_TICK_ONESHOT=y CONFIG_HOTPLUG_CPU=n trigger kernel/time/tick-broadcast.c:39:13: warning: ‘tick_broadcast_oneshot_offline’ \ declared ‘static’ but never defined [-Wunused-function] due to that function's definition missing. Move the CONFIG_HOTPLUG_CPU ifdeffery around its declaration too. Fixes: 1b72d4323798 ("tick: Remove outgoing CPU from broadcast masks") Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Reviewed-by: Mukesh Ojha Cc: Valentin Schneider Cc: Frederic Weisbecker Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190329110508.6621-1-bp@alien8.de --- kernel/time/tick-broadcast.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0283523de045..7541cbca695e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,12 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif #endif /* -- cgit v1.2.3 From 2011fccfb61bbd1d7c8864b2b3ed7012342e9ba3 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Mar 2019 18:01:57 -0700 Subject: bpf: Support variable offset stack access from helpers Currently there is a difference in how verifier checks memory access for helper arguments for PTR_TO_MAP_VALUE and PTR_TO_STACK with regard to variable part of offset. 
check_map_access, which is used for PTR_TO_MAP_VALUE, can handle variable offsets just fine, so that a BPF program can call a helper like this: some_helper(map_value_ptr + off, size), where the offset is unknown at load time but is checked by the program to be in a safe range (off >= 0 && off + size < map_value_size). But that is not the case for check_stack_boundary, which is used for PTR_TO_STACK, and the same code with a pointer to stack is rejected by the verifier: some_helper(stack_value_ptr + off, size); For example: 0: (7a) *(u64 *)(r10 -16) = 0 1: (7a) *(u64 *)(r10 -8) = 0 2: (61) r2 = *(u32 *)(r1 +0) 3: (57) r2 &= 4 4: (17) r2 -= 16 5: (0f) r2 += r10 6: (18) r1 = 0xffff888111343a80 8: (85) call bpf_map_lookup_elem#1 invalid variable stack read R2 var_off=(0xfffffffffffffff0; 0x4) Add support for variable offset access to check_stack_boundary so that if the offset is checked by the program to be in a safe range it's accepted by the verifier. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 75 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2fe89138309a..87221fda1321 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2157,6 +2157,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1, true); } +static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, + int off, int access_size, + bool zero_size_allowed) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", + regno, tn_buf, access_size); + } + return -EACCES; + } + return 0; +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. 
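A restricted-C sketch of the access pattern this change legalizes; the map definition, section names, and helper header reflect a typical libbpf setup of this era and are not taken from the patch:

#include <linux/bpf.h>
#include "bpf_helpers.h" /* selftests/libbpf helper macros; path varies */

struct bpf_map_def SEC("maps") vals = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u64),
	.max_entries	= 8,
};

SEC("socket")
int var_stack_key(struct __sk_buff *skb)
{
	__u64 zeros[2] = {0};		/* 16 initialized stack bytes */
	__u32 off = skb->len & 0x4;	/* verifier-visible: 0 or 4 */
	void *key = (char *)zeros + off;/* PTR_TO_STACK with var_off */

	/*
	 * Rejected outright before this patch; accepted afterwards
	 * because both extremes of the offset range are bounds-checked.
	 */
	return bpf_map_lookup_elem(&vals, key) ? 1 : 0;
}

char _license[] SEC("license") = "GPL";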
@@ -2169,7 +2192,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int off, i, slot, spi; + int err, min_off, max_off, i, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2183,21 +2206,23 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", - regno, tn_buf); - return -EACCES; - } - off = reg->off + reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - return -EACCES; + if (tnum_is_const(reg->var_off)) { + min_off = max_off = reg->var_off.value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + } else { + min_off = reg->smin_value + reg->off; + max_off = reg->umax_value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + err = __check_stack_boundary(env, regno, max_off, access_size, + zero_size_allowed); + if (err) + return err; } if (meta && meta->raw_mode) { @@ -2206,10 +2231,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } - for (i = 0; i < access_size; i++) { + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; - slot = -(off + i) - 1; + slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) goto err; @@ -2222,8 +2247,16 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; } err: - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + min_off, i - min_off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", + tn_buf, i - min_off, access_size); + } return -EACCES; mark: /* reading any byte out of 8-byte 'spill_slot' will cause @@ -2232,7 +2265,7 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent); } - return update_stack_depth(env, state, off); + return update_stack_depth(env, state, min_off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, -- cgit v1.2.3 From b5dee3130bb4014511f5d0dd46855ed843e3fdc8 Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:41 +0800 Subject: PM / sleep: Refactor filesystems sync to reduce duplication Create a common helper to sync filesystems for system suspend and hibernation. Signed-off-by: Harry Pan Acked-by: Pavel Machek [ rjw: Changelog ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 5 +---- kernel/power/main.c | 9 +++++++++ kernel/power/suspend.c | 13 +++++-------- kernel/power/user.c | 5 +---- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..cc105ecd9c07 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -709,9 +708,7 @@ int hibernate(void) goto Exit; } - pr_info("Syncing filesystems ... \n"); - ksys_sync(); - pr_info("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..40472a7c5536 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "power.h" @@ -51,6 +52,14 @@ void unlock_system_sleep(void) } EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ + pr_info("Syncing filesystems ... "); + ksys_sync(); + pr_cont("done.\n"); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..e39059dea38b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC - trace_suspend_resume(TPS("sync_filesystems"), 0, true); - pr_info("Syncing filesystems ... "); - ksys_sync(); - pr_cont("done.\n"); - trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif + if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + ksys_sync_helper(); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - printk("Syncing filesystems ... "); - ksys_sync(); - printk("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) -- cgit v1.2.3 From c64546b17bc940643545dd34eac21f51764d633c Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:43 +0800 Subject: PM / sleep: Measure the time of filesystems syncing Measure the filesystems sync time during system sleep more precisely. Among other things, this allows the pr_cont() to be dropped from ksys_sync_helper() and makes automatic system suspend and hibernation profiling somewhat more straightforward. Signed-off-by: Harry Pan [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 40472a7c5536..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -54,9 +54,14 @@ EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { - pr_info("Syncing filesystems ... 
"); + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); ksys_sync(); - pr_cont("done.\n"); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); -- cgit v1.2.3 From de7b77e5bb9451417ca57f1b6501da654587c048 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Mar 2019 07:00:29 -0500 Subject: cpu/hotplug: Create SMT sysfs interface for all arches Make the /sys/devices/system/cpu/smt/* files available on all arches, so user space has a consistent way to detect whether SMT is enabled. The 'control' file now shows 'notimplemented' for architectures which don't yet have CONFIG_HOTPLUG_SMT. [ tglx: Make notimplemented a real state ] Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Waiman Long Cc: Peter Zijlstra Cc: Jiri Kosina Link: https://lkml.kernel.org/r/469c2b98055f2c41e75748e06447d592a64080c9.1553635520.git.jpoimboe@redhat.com --- kernel/cpu.c | 64 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6754f3ecfd94..b8bf3f93e39b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2033,19 +2033,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static const char *smt_states[] = { - [CPU_SMT_ENABLED] = "on", - [CPU_SMT_DISABLED] = "off", - [CPU_SMT_FORCE_DISABLED] = "forceoff", - [CPU_SMT_NOT_SUPPORTED] = "notsupported", -}; - -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); -} - static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); @@ -2116,9 +2103,10 @@ static int cpuhp_smt_enable(void) return ret; } + static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { int ctrlval, ret; @@ -2156,14 +2144,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr, unlock_device_hotplug(); return ret ? 
ret : count; } + +#else /* !CONFIG_HOTPLUG_SMT */ +static ssize_t +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return -ENODEV; +} +#endif /* CONFIG_HOTPLUG_SMT */ + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", + [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + const char *state = smt_states[cpu_smt_control]; + + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return __store_smt_control(dev, attr, buf, count); +} static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); static ssize_t show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) { - bool active = topology_max_smt_threads() > 1; - - return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); + return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); } static DEVICE_ATTR(active, 0444, show_smt_active, NULL); @@ -2179,21 +2197,17 @@ static const struct attribute_group cpuhp_smt_attr_group = { NULL }; -static int __init cpu_smt_state_init(void) +static int __init cpu_smt_sysfs_init(void) { return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -#else -static inline int cpu_smt_state_init(void) { return 0; } -#endif - static int __init cpuhp_sysfs_init(void) { int cpu, ret; - ret = cpu_smt_state_init(); + ret = cpu_smt_sysfs_init(); if (ret) return ret; @@ -2214,7 +2228,7 @@ static int __init cpuhp_sysfs_init(void) return 0; } device_initcall(cpuhp_sysfs_init); -#endif +#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that -- cgit v1.2.3 From 40ed29b373381532ef222e509c5aa69a1d8561ea Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 23 Sep 2018 12:11:33 +0000 Subject: ring-buffer: Fix ring buffer size in rb_write_something() 'cnt' should be used to calculate ring buffer size rather than data->cnt Link: http://lkml.kernel.org/r/1537704693-184237-1-git-send-email-yuehaibing@huawei.com Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4f33d7d841af 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ - size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); -- cgit v1.2.3 From f45d1225adb0479478cee989e2ae2d7d2c62b31b Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Mar 2019 11:28:51 -0700 Subject: tracing: Kernel access to Ftrace instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ftrace provides the feature “instances” that provides the capability to create multiple Ftrace ring buffers. However, currently these buffers are created/accessed via userspace only. 
The kernel APIs providing these features are not exported, hence cannot be used by other kernel components. This patch aims to extend this infrastructure to provide the flexibility to create/log/remove/ enable-disable existing trace events to these buffers from within the kernel. Link: http://lkml.kernel.org/r/1553106531-3281-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Joe Jin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 74 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace_events.c | 1 + 2 files changed, 51 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..4384fcc386c8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3053,6 +3053,7 @@ void trace_printk_init_buffers(void) if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { @@ -3213,6 +3214,7 @@ int trace_array_printk(struct trace_array *tr, va_end(ap); return ret; } +EXPORT_SYMBOL_GPL(trace_array_printk); __printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, @@ -8037,7 +8039,7 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -static int instance_mkdir(const char *name) +struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; @@ -8101,7 +8103,7 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return tr; out_free_tr: free_trace_buffers(tr); @@ -8113,33 +8115,21 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return ret; + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(trace_array_create); +static int instance_mkdir(const char *name) +{ + return PTR_ERR_OR_ZERO(trace_array_create(name)); } -static int instance_rmdir(const char *name) +static int __remove_instance(struct trace_array *tr) { - struct trace_array *tr; - int found = 0; - int ret; int i; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -ENODEV; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) { - found = 1; - break; - } - } - if (!found) - goto out_unlock; - - ret = -EBUSY; if (tr->ref || (tr->current_trace && tr->current_trace->ref)) - goto out_unlock; + return -EBUSY; list_del(&tr->list); @@ -8165,10 +8155,46 @@ static int instance_rmdir(const char *name) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); + tr = NULL; - ret = 0; + return 0; +} + +int trace_array_destroy(struct trace_array *tr) +{ + int ret; + + if (!tr) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = __remove_instance(tr); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + ret = __remove_instance(tr); + break; + } + } - out_unlock: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..81c038ed6cee 100644 --- a/kernel/trace/trace_events.c +++ 
b/kernel/trace/trace_events.c @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } +EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event -- cgit v1.2.3 From 8a062902be725f647dc8da532b04d836546a369a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:15 -0500 Subject: tracing: Add tracing error log Introduce a new ftrace file, tracing/error_log, for ftrace commands to log errors. This is useful for allowing more complex commands such as hist trigger and kprobe_event commands to point out specifically where something may have gone wrong without forcing them to resort to more ad hoc methods such as tacking error messages onto existing output files. To log a tracing error, call the tracing_log_err() function, passing it a location string describing where it came from e.g. kprobe_events or system:event, the command that caused the error, an array of static error strings describing errors and an index within that array which describes the specific error, along with the position to place the error caret. Reading the log displays the last (currently) 8 errors logged in the following format: [timestamp] <loc>: error: <static error text> Command: <command> ^ Memory for the error log isn't allocated unless there has been a trace event error, and the error log can be cleared and have its memory freed by writing the empty string in truncation mode to it: # echo > tracing/error_log. Link: http://lkml.kernel.org/r/0c2c82571fd38c5f3a88ca823627edff250e9416.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Suggested-by: Masami Hiramatsu Improvements-suggested-by: Steve Rostedt Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 4 + 2 files changed, 222 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4384fcc386c8..7978168f5041 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6878,6 +6878,221 @@ static const struct file_operations snapshot_raw_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ +#define TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ +}; + +static LIST_HEAD(tracing_err_log); +static DEFINE_MUTEX(tracing_err_log_lock); + +static unsigned int n_tracing_err_log_entries; + +struct tracing_log_err *get_tracing_log_err(void) +{ + struct tracing_log_err *err; + + if (n_tracing_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + err = ERR_PTR(-ENOMEM); + n_tracing_err_log_entries++; + + return err; + } + + err = list_first_entry(&tracing_err_log, struct tracing_log_err, list); + list_del(&err->list); + + return err; + } + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurrence of @str within @cmd.
The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * : error: + * Command: + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). + */ +void tracing_log_err(const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos) +{ + struct tracing_log_err *err; + + mutex_lock(&tracing_err_log_lock); + err = get_tracing_log_err(); + if (PTR_ERR(err) == -ENOMEM) { + mutex_unlock(&tracing_err_log_lock); + return; + } + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tracing_err_log); + mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(void) +{ + struct tracing_log_err *err, *next; + + mutex_lock(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tracing_err_log, list) { + list_del(&err->list); + kfree(err); + } + + n_tracing_err_log_entries = 0; + mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tracing_err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tracing_err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +{ + u8 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct 
seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &tracing_err_log_seq_ops); + + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = seq_lseek, +}; + static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -8284,6 +8499,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &snapshot_fops); #endif + trace_create_file("error_log", 0644, d_tracer, + tr, &tracing_err_log_fops); + for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..b711edbef7e7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1884,6 +1884,10 @@ extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char**)); +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats -- cgit v1.2.3 From a1a05bb40e229d148c071fcd2ed787b21f61ff8b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:16 -0500 Subject: tracing: Save the last hist command's associated event name In preparation for making use of the new trace error log, save the subsystem and event name associated with the last hist command - it will be passed as the location param in the tracing_log_err() calls.
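For a sense of what gets saved (the trigger string and event are hypothetical), a failing command written to events/sched/sched_switch/trigger would be recorded roughly as:

	last_cmd     = "hist:keys=bogus"
	last_cmd_loc = "hist:sched:sched_switch"

The location string is assembled with snprintf(..., "hist:%s:%s", system, name) as in the diff below, and later becomes the location parameter of tracing_log_err().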
Link: http://lkml.kernel.org/r/eb0fd1362be8f39facb86c83eecf441b7a5876f8.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 795aa2038377..0de702bf148f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -535,15 +535,34 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_hist_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; + static char hist_err_str[MAX_FILTER_STR_VAL]; -static void last_cmd_set(char *str) +static void last_cmd_set(struct trace_event_file *file, char *str) { + const char *system = NULL, *name = NULL; + struct trace_event_call *call; + if (!str) return; - strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + + if (file) { + call = file->event_call; + + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } + } + + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } static void hist_err(char *str, char *var) @@ -583,6 +602,8 @@ static void hist_err_event(char *str, char *system, char *event, char *var) static void hist_err_clear(void) { hist_err_str[0] = '\0'; + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } static bool have_hist_err(void) @@ -5438,8 +5459,8 @@ static int hist_show(struct seq_file *m, void *v) } if (have_hist_err()) { - seq_printf(m, "\nERROR: %s\n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_hist_cmd); + seq_printf(m, "\n%s: error: \n", hist_err_str); + seq_printf(m, " Last command: %s\n", last_cmd); } out_unlock: @@ -6043,8 +6064,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); if (glob && strlen(glob)) { - last_cmd_set(param); hist_err_clear(); + last_cmd_set(file, param); } if (!param) -- cgit v1.2.3 From d566c5e9d1bad6773fe9cce3d4514cca2cc32e4e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:17 -0500 Subject: tracing: Use tracing error_log with hist triggers Replace hist_err() and hist_err_event() with tracing_log_err() from the new tracing error_log mechanism. Also add a couple related helper functions and remove most of the old hist_err()-related code. With this change, users no longer read the hist files for hist trigger error information, but instead look at tracing/error_log for the same information. 
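A usage sketch (the trigger and the exact output are illustrative; the error text is one of the static strings from the ERRORS table added below):

	# cd /sys/kernel/debug/tracing
	# echo 'hist:keys=bogus' > events/sched/sched_switch/trigger
	-su: echo: write error: Invalid argument
	# cat error_log
	[  122.592056] hist:sched:sched_switch: error: Couldn't find field
	  Command: hist:keys=bogus
	                     ^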
Link: http://lkml.kernel.org/r/c98f77a97c9715d18b623eeb5741057b330d5ac0.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 206 ++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0de702bf148f..071c62cacba7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -22,6 +22,57 @@ #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define ERRORS \ + C(NONE, "No error"), \ + C(DUPLICATE_VAR, "Variable already defined"), \ + C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ + C(TOO_MANY_VARS, "Too many variables defined"), \ + C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ + C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ + C(TRIGGER_EEXIST, "Hist trigger already exists"), \ + C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ + C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ + C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ + C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ + C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ + C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ + C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ + C(HIST_NOT_FOUND, "Matching event histogram not found"), \ + C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ + C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ + C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ + C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ + C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ + C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ + C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ + C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ + C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ + C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ + C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ + C(TOO_MANY_PARAMS, "Too many action params"), \ + C(PARAM_NOT_FOUND, "Couldn't find param"), \ + C(INVALID_PARAM, "Invalid action param"), \ + C(ACTION_NOT_FOUND, "No action found"), \ + C(NO_SAVE_PARAMS, "No params found for save()"), \ + C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ + C(ACTION_MISMATCH, "Handler doesn't support action"), \ + C(NO_CLOSING_PAREN, "No closing paren found"), \ + C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ + C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ + C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(VAR_NOT_FOUND, "Couldn't find variable"), \ + C(FIELD_NOT_FOUND, "Couldn't find field"), + +#undef C +#define C(a, b) HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -538,7 +589,10 @@ static struct track_data *track_data_alloc(unsigned int key_len, static char last_cmd[MAX_FILTER_STR_VAL]; static char last_cmd_loc[MAX_FILTER_STR_VAL]; -static char hist_err_str[MAX_FILTER_STR_VAL]; 
+static int errpos(char *str) +{ + return err_pos(last_cmd, str); +} static void last_cmd_set(struct trace_event_file *file, char *str) { @@ -565,55 +619,17 @@ static void last_cmd_set(struct trace_event_file *file, char *str) snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err(char *str, char *var) +static void hist_err(u8 err_type, u8 err_pos) { - int maxlen = MAX_FILTER_STR_VAL - 1; - - if (!str) - return; - - if (strlen(hist_err_str)) - return; - - if (!var) - var = ""; - - if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) - return; - - strcat(hist_err_str, str); - strcat(hist_err_str, var); -} - -static void hist_err_event(char *str, char *system, char *event, char *var) -{ - char err[MAX_FILTER_STR_VAL]; - - if (system && var) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); - else if (system) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); - else - strscpy(err, var, MAX_FILTER_STR_VAL); - - hist_err(str, err); + tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); } static void hist_err_clear(void) { - hist_err_str[0] = '\0'; last_cmd[0] = '\0'; last_cmd_loc[0] = '\0'; } -static bool have_hist_err(void) -{ - if (strlen(hist_err_str)) - return true; - - return false; -} - struct synth_trace_event { struct trace_entry ent; u64 fields[]; @@ -1740,7 +1756,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1791,7 +1807,7 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2023,7 +2039,6 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) attrs->n_actions++; ret = 0; } - return ret; } @@ -2083,7 +2098,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", str); + hist_err(HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2681,8 +2696,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err_event("Couldn't find variable: $", - system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2716,7 +2730,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err("Invalid field modifier: ", modifier); + hist_err(HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2732,7 +2746,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err("Couldn't find field: ", field_name); + hist_err(HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2843,7 +2857,7 @@ 
static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2926,7 +2940,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err("Timestamp units in expression don't match", NULL); + hist_err(HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2944,7 +2958,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -3182,16 +3196,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("trace action: Too many field variables defined: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("trace action: Event file not found: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3204,8 +3216,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("trace action: Matching event histogram not found: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3266,8 +3277,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't create histogram for field: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3279,8 +3289,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't find synthetic variable: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3417,21 +3426,21 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err("Too many field variables defined: ", field_name); + hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err("Couldn't parse field variable: ", field_name); + hist_err(HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err("Couldn't create or find variable: ", field_name); + hist_err(HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3763,14 +3772,14 @@ static int track_data_create(struct hist_trigger_data *hist_data, track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err("For 
onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); + hist_err(HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); + hist_err(HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3783,7 +3792,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onmax variable: ", "__max"); + hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3791,7 +3800,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onchange variable: ", "__change"); + hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3810,20 +3819,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err("Too many action params", ""); + hist_err(HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - hist_err("No action param found", ""); + hist_err(HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err("Invalid action param: ", param); + hist_err(HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3855,14 +3864,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err("action parsing: No action found", ""); + hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err("action parsing: No action found", ""); + hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3871,7 +3880,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err("action parsing: No params found for %s", "save"); + hist_err(HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } @@ -3885,7 +3894,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3897,7 +3906,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err("action parsing: No closing paren found: %s", params); + hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3907,7 +3916,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -4060,7 +4069,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err_event("trace action: Couldn't find param: $", system, event, 
var); + hist_err(HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4135,7 +4144,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); + hist_err(HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4196,15 +4205,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("trace action: Param type doesn't match synthetic event field type: ", - system, event_name, param); + hist_err(HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); + hist_err(HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4250,7 +4258,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err("save action: Can't have more than one save() action per hist", ""); + hist_err(HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4263,7 +4271,7 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err("save action: Couldn't create field variable: ", param); + hist_err(HIST_ERR_FIELD_VAR_CREATE_FAIL, errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4297,19 +4305,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err("onmatch: Missing closing paren: ", match_event); + hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + hist_err(HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err_event("onmatch: Invalid subsystem or event name: ", - match_event_system, match_event, NULL); + hist_err(HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4400,7 +4407,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err("Variable already defined: ", var_name); + hist_err(HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4481,7 +4488,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err("Using variable references as keys not supported: ", field_str); + hist_err(HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4595,13 +4602,13 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err("Malformed assignment: ", var_name); + hist_err(HIST_ERR_MALFORMED_ASSIGNMENT, errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", var_name); + hist_err(HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5458,11 +5465,6 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } 
- if (have_hist_err()) { - seq_printf(m, "\n%s: error: \n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_cmd); - } - out_unlock: mutex_unlock(&event_mutex); @@ -5834,7 +5836,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); + hist_err(HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5855,7 +5857,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err("Hist trigger already exists", NULL); + hist_err(HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5863,7 +5865,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err("Can't clear or continue a nonexistent hist trigger", NULL); + hist_err(HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5888,7 +5890,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err("Couldn't set trace_clock: ", clock); + hist_err(HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } -- cgit v1.2.3 From 34f76afaac7a437a2ce381225135563928b359dd Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:18 -0500 Subject: tracing: Use tracing error_log with trace event filters Use tracing_log_err() from the new tracing error_log mechanism to send filter parse errors to tracing/error_log. With this change, users will be able to see filter errors by looking at tracing/error_log. The same errors will also be available in the filter file, as expected. Link: http://lkml.kernel.org/r/1d942c419941539a11d78a6810fc5740a99b2974.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 05a66493a164..290d42c59101 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -66,7 +66,8 @@ static const char * ops[] = { OPS }; C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ - C(NO_FILTER, "No filter found"), + C(ERRNO, "Error"), \ + C(NO_FILTER, "No filter found") #undef C #define C(a, b) FILT_ERR_##a @@ -76,7 +77,7 @@ enum { ERRORS }; #undef C #define C(a, b) b -static char *err_text[] = { ERRORS }; +static const char *err_text[] = { ERRORS }; /* Called after a '!' 
character but "!=" and "!~" are not "not"s */ static bool is_not(const char *str) @@ -947,8 +948,14 @@ static void append_filter_err(struct filter_parse_error *pe, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + tracing_log_err("event filter parse error", + filter->filter_string, err_text, + pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + tracing_log_err("event filter parse error", + filter->filter_string, err_text, + FILT_ERR_ERRNO, 0); } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); -- cgit v1.2.3 From ab105a4fb89496c71c5a0f3222347c506c30feb0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 31 Mar 2019 18:48:19 -0500 Subject: tracing: Use tracing error_log with probe events Use tracing error_log with probe events for logging error more precisely. This also makes all parse error returns -EINVAL (except for -ENOMEM), because user can see better error message in error_log file now. Link: http://lkml.kernel.org/r/6a4d90e141d138040ea61f4776b991597077451e.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 77 ++++++++----- kernel/trace/trace_probe.c | 274 +++++++++++++++++++++++++++++++------------- kernel/trace/trace_probe.h | 77 ++++++++++++- kernel/trace/trace_uprobe.c | 44 ++++--- 4 files changed, 348 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5d5129b05df7..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) { + if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } return ret; } @@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk; + struct trace_kprobe *tk = NULL; int i, len, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; @@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; + trace_probe_log_init("trace_kprobe", argc, argv); + event = strchr(&argv[0][1], ':'); if (event) event++; if (isdigit(argv[0][1])) { if (!is_return) { - pr_info("Maxactive is not for kprobe"); - return -EINVAL; + trace_probe_log_err(1, MAXACT_NO_KPROBE); + goto parse_error; } if (event) len = event - &argv[0][1] - 1; else len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) - return -E2BIG; + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { - pr_info("Invalid maxactive number\n"); - return ret; + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. 
*/ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { - pr_info("Maxactive is too big (%d > %d).\n", - maxactive, KRETPROBE_MAXACTIVE_MAX); - return -E2BIG; + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; } } /* try to parse an address. if that fails, try to read the * input as a symbol. */ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) - return -ECANCELED; + if (strchr(argv[1], '/') && strchr(argv[1], ':')) { + ret = -ECANCELED; + goto error; + } /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { - pr_info("Failed to parse either an address or a symbol.\n"); - goto out; + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { - pr_info("Given offset is not valid for return probe.\n"); - ret = -EINVAL; - goto out; + trace_probe_log_err(0, BAD_RETPROBE); + goto parse_error; } } - argc -= 2; argv += 2; + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) - goto out; + goto parse_error; } else { /* Make a new event name */ if (symbol) @@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); - /* This must return -ENOMEM otherwise there is a bug */ + /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; + goto out; /* We know tk is not allocated */ } + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { @@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); kfree(tmp); if (ret) - goto error; + goto error; /* This can be -ENOMEM */ } ret = register_trace_kprobe(tk); - if (ret) + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM) + trace_probe_log_err(0, FAIL_REG_PROBE); goto error; + } + out: + trace_probe_log_clear(); kfree(symbol); return ret; +parse_error: + ret = -EINVAL; error: free_trace_kprobe(tk); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f8411e7835f..e11f98c49d72 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,6 +13,11 @@ #include "trace_probe.h" +#undef C +#define C(a, b) b + +static const char *trace_probe_err_text[] = { ERRORS }; + static const char *reserved_field_names[] = { "common_type", "common_flags", @@ -133,6 +138,60 @@ fail: return NULL; } +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ + trace_probe_log.subsystem = subsystem; + trace_probe_log.argc = argc; + trace_probe_log.argv = argv; + 
trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ + trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ + char *command, *p; + int i, len = 0, pos = 0; + + if (!trace_probe_log.argv) + return; + + /* Recalculate the length and allocate buffer */ + for (i = 0; i < trace_probe_log.argc; i++) { + if (i == trace_probe_log.index) + pos = len; + len += strlen(trace_probe_log.argv[i]) + 1; + } + command = kzalloc(len, GFP_KERNEL); + if (!command) + return; + + /* And make a command string from argv array */ + p = command; + for (i = 0; i < trace_probe_log.argc; i++) { + len = strlen(trace_probe_log.argv[i]); + strcpy(p, trace_probe_log.argv[i]); + p[len] = ' '; + p += len + 1; + } + *(p - 1) = '\0'; + + tracing_log_err(trace_probe_log.subsystem, command, + trace_probe_err_text, err_type, pos + offset); + + kfree(command); +} + /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) /* @buf must have MAX_EVENT_NAME_LEN size */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, - char *buf) + char *buf, int offset) { const char *slash, *event = *pevent; int len; @@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, slash = strchr(event, '/'); if (slash) { if (slash == event) { - pr_info("Group name is not specified\n"); + trace_probe_log_err(offset, NO_GROUP_NAME); return -EINVAL; } if (slash - event + 1 > MAX_EVENT_NAME_LEN) { - pr_info("Group name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, GROUP_TOO_LONG); + return -EINVAL; } strlcpy(buf, event, slash - event + 1); if (!is_good_name(buf)) { - pr_info("Group name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_GROUP_NAME); + return -EINVAL; } *pgroup = buf; *pevent = slash + 1; + offset += slash - event + 1; event = *pevent; } len = strlen(event); if (len == 0) { - pr_info("Event name is not specified\n"); + trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { - pr_info("Event name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, EVENT_TOO_LONG); + return -EINVAL; } if (!is_good_name(event)) { - pr_info("Event name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_EVENT_NAME); return -EINVAL; } return 0; @@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags) + struct fetch_insn *code, unsigned int flags, int offs) { unsigned long param; int ret = 0; int len; if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) + if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - else + } else { + trace_probe_log_err(offs, RETVAL_ON_PROBE); ret = -EINVAL; + } } else if ((len = str_has_prefix(arg, "stack"))) { if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, &param); - if (ret || ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK)) + if (ret) { + goto inval_var; + } else if ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_STACK_NUM);
ret = -EINVAL; - else { + } else { code->op = FETCH_OP_STACK; code->param = (unsigned int)param; } } else - ret = -EINVAL; + goto inval_var; } else if (strcmp(arg, "comm") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && (len = str_has_prefix(arg, "arg"))) { - if (!isdigit(arg[len])) - return -EINVAL; ret = kstrtoul(arg + len, 10, &param); - if (ret || !param || param > PARAM_MAX_STACK) + if (ret) { + goto inval_var; + } else if (!param || param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_ARG_NUM); return -EINVAL; + } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif } else - ret = -EINVAL; + goto inval_var; return ret; + +inval_var: + trace_probe_log_err(offs, BAD_VAR); + return -EINVAL; } /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags) + unsigned int flags, int offs) { struct fetch_insn *code = *pcode; unsigned long param; @@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags); + ret = parse_probe_vars(arg + 1, type, code, flags, offs); break; case '%': /* named register */ @@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_REG; code->param = (unsigned int)ret; ret = 0; - } + } else + trace_probe_log_err(offs, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, &param); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_MEM_ADDR); break; + } /* load address */ code->op = FETCH_OP_IMM; code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) + if (flags & TPARG_FL_KERNEL) { + trace_probe_log_err(offs, FILE_ON_KPROBE); return -EINVAL; - + } ret = kstrtol(arg + 2, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_FILE_OFFS); break; + } code->op = FETCH_OP_FOFFS; code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) + if (!(flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(offs, SYM_ON_UPROBE); return -EINVAL; - + } /* Preserve symbol for updating */ code->op = FETCH_NOP_SYMBOL; code->data = kstrdup(arg + 1, GFP_KERNEL); if (!code->data) return -ENOMEM; - if (++code == end) - return -E2BIG; - + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } code->op = FETCH_OP_IMM; code->immediate = 0; } /* These are fetching from memory */ - if (++code == end) - return -E2BIG; + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } *pcode = code; code->op = FETCH_OP_DEREF; code->offset = offset; @@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, /* fall through */ case '-': tmp = strchr(arg, '('); - if (!tmp) + if (!tmp) { + trace_probe_log_err(offs, DEREF_NEED_BRACE); return -EINVAL; - + } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_DEREF_OFFS); break; - + } + offs += (tmp + 1 - arg) + (arg[0] != '-' ?
1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); - - if (tmp) { + if (!tmp) { + trace_probe_log_err(offs + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } else { const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags); + ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) + if (code->op == FETCH_OP_COMM) { + trace_probe_log_err(offs, COMM_CANT_DEREF); + return -EINVAL; + } + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); return -EINVAL; - if (++code == end) - return -E2BIG; + } *pcode = code; code->op = FETCH_OP_DEREF; @@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ + trace_probe_log_err(offs, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf, return -EINVAL; code++; if (code->op != FETCH_OP_NOP) - return -E2BIG; + return -EINVAL; *pcode = code; code->op = FETCH_OP_MOD_BF; @@ -392,32 +484,53 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags) + struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2; + char *t, *t2, *t3; int ret, len; - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; + len = strlen(arg); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(offset, ARG_TOO_LONG); + return -EINVAL; + } else if (len == 0) { + trace_probe_log_err(offset, NO_ARG_BODY); + return -EINVAL; } + parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); + if (!parg->comm) return -ENOMEM; - } + t = strchr(arg, ':'); if (t) { *t = '\0'; t2 = strchr(++t, '['); if (t2) { - *t2 = '\0'; - parg->count = simple_strtoul(t2 + 1, &t2, 0); - if (strcmp(t2, "]") || parg->count == 0) + *t2++ = '\0'; + t3 = strchr(t2, ']'); + if (!t3) { + offset += t2 + strlen(t2) - arg; + trace_probe_log_err(offset, + ARRAY_NO_CLOSE); + return -EINVAL; + } else if (t3[1] != '\0') { + trace_probe_log_err(offset + t3 + 1 - arg, + BAD_ARRAY_SUFFIX); + return -EINVAL; + } + *t3 = '\0'; + if (kstrtouint(t2, 0, &parg->count) || !parg->count) { + trace_probe_log_err(offset + t2 - arg, + BAD_ARRAY_NUM); return -EINVAL; - if (parg->count > MAX_ARRAY_LEN) - return -E2BIG; + } + if (parg->count > MAX_ARRAY_LEN) { + trace_probe_log_err(offset + t2 - arg, + ARRAY_TOO_BIG); + return -EINVAL; + } } } /* @@ -429,7 +542,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, else parg->type = find_fetch_type(t); if (!parg->type) { - pr_info("Unsupported type: %s\n", t); + trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_TYPE); return -EINVAL; } parg->offset = *size; @@ -450,7 +563,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags); + flags, offset); if (ret) goto fail; @@ -458,7 +571,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { - pr_info("string only accepts memory or address.\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } @@ -470,7 +584,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } } @@ -483,7 +598,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -493,20 +609,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Modify operation */ if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); - if (ret) + if (ret) { + trace_probe_log_err(offset + t - arg, BAD_BITFIELD); goto fail; + } } /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING) { - pr_info("array only accepts memory or address\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -555,15 +675,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, { struct probe_arg *parg = &tp->args[i]; char *body; - int ret; /* Increment count for freeing args in error case */ tp->nr_args++; body = strchr(arg, '='); if (body) { - if (body - arg > MAX_ARG_NAME_LEN || body == arg) + if (body - arg > MAX_ARG_NAME_LEN) { + trace_probe_log_err(0, ARG_NAME_TOO_LONG); + return -EINVAL; + } else if (body == arg) { + trace_probe_log_err(0, NO_ARG_NAME); return -EINVAL; + } parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { @@ -575,22 +699,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, return -ENOMEM; if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); + trace_probe_log_err(0, BAD_ARG_NAME); return -EINVAL; } - if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { - pr_info("Argument[%d]: '%s' conflicts with another field.\n", - i, parg->name); + trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); - if (ret) - pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); - return ret; + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, + body - arg); } void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2177c206de15..b7737666c1a8 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -280,8 +280,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int traceprobe_parse_event_name(const char **pevent, - const char **pgroup, char *buf); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); @@ -298,3 +298,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS \ + C(FILE_NOT_FOUND, "Failed to find the given file"), \ + C(NO_REGULAR_FILE, "Not a regular file"), \ + C(BAD_REFCNT, "Invalid reference counter offset"), \ + C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ + C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ + C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ + C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT, "Invalid maxactive number"), \ + C(MAXACT_TOO_BIG, "Maxactive is too big"), \ + C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_GROUP_NAME, "Group name is not specified"), \ + C(GROUP_TOO_LONG, "Group name is too long"), \ + C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ + C(NO_EVENT_NAME, "Event name is not specified"), \ + C(EVENT_TOO_LONG, "Event name is too long"), \ + C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(BAD_STACK_NUM, "Invalid stack number"), \ + C(BAD_ARG_NUM, "Invalid argument number"), \ + C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_REG_NAME, "Invalid register name"), \ + C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ + C(BAD_FILE_OFFS, "Invalid file offset value"), \ + C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ + C(TOO_MANY_OPS, "Dereference is too much nested"), \ + C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ + C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ + C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ + C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ + C(BAD_FETCH_ARG, "Invalid fetch argument"), \ + C(ARRAY_NO_CLOSE, "Array is not closed"), \ + C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ + C(BAD_ARRAY_NUM, "Invalid array size"), \ + C(ARRAY_TOO_BIG, "Array number is too big"), \ + C(BAD_TYPE, "Unknown type is specified"), \ + C(BAD_STRING, "String accepts only memory argument"), \ + C(BAD_BITFIELD, "Invalid bitfield"), \ + C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ + C(NO_ARG_NAME, "Argument name is not specified"), \ + C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ + C(USED_ARG_NAME, "This argument name is already used"), \ + C(ARG_TOO_LONG, "Argument expression is too long"), \ + C(NO_ARG_BODY, "No argument 
expression"), \ + C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ + C(FAIL_REG_PROBE, "Failed to register probe event"), + +#undef C +#define C(a, b) TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { + const char *subsystem; + const char **argv; + int argc; + int index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err) \ + __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index be78d99ee6bc..cd8750a72768 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -457,13 +457,19 @@ static int trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } + trace_probe_log_init("trace_uprobe", argc, argv); + trace_probe_log_set_index(1); /* filename is the 2nd argument */ + *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { + trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); + trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { + trace_probe_log_err(0, NO_REGULAR_FILE); ret = -EINVAL; goto fail_address_parse; } @@ -472,9 +478,16 @@ static int trace_uprobe_create(int argc, const char **argv) rctr = strchr(arg, '('); if (rctr) { rctr_end = strchr(rctr, ')'); - if (rctr > rctr_end || *(rctr_end + 1) != 0) { + if (!rctr_end) { + ret = -EINVAL; + rctr_end = rctr + strlen(rctr); + trace_probe_log_err(rctr_end - filename, + REFCNT_OPEN_BRACE); + goto fail_address_parse; + } else if (rctr_end[1] != '\0') { ret = -EINVAL; - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr_end + 1 - filename, + BAD_REFCNT_SUFFIX); goto fail_address_parse; } @@ -482,22 +495,23 @@ static int trace_uprobe_create(int argc, const char **argv) *rctr_end = '\0'; ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr - filename, BAD_REFCNT); goto fail_address_parse; } } /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); goto fail_address_parse; - - argc -= 2; - argv += 2; + } /* setup a probe */ + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) goto fail_address_parse; } else { @@ -519,6 +533,9 @@ static int trace_uprobe_create(int argc, const char **argv) kfree(tail); } + argc -= 2; + argv += 2; + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { ret = PTR_ERR(tu); @@ -539,6 +556,7 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? 
TPARG_FL_RETURN : 0); kfree(tmp); @@ -547,20 +565,20 @@ static int trace_uprobe_create(int argc, const char **argv) } ret = register_trace_uprobe(tu); - if (ret) - goto error; - return 0; + if (!ret) + goto out; error: free_trace_uprobe(tu); +out: + trace_probe_log_clear(); return ret; fail_address_parse: + trace_probe_log_clear(); path_put(&path); kfree(filename); - pr_info("Failed to parse address or file.\n"); - return ret; } -- cgit v1.2.3 From d18bf4229b1772e91c0c36772737c01cf9726720 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 12 Mar 2019 04:06:37 -0400 Subject: perf/core: Make perf_swevent_init_cpu() static 'make W=1' causes GCC to complain: kernel/events/core.c:11877:6: warning: no previous prototype for 'perf_swevent_init_cpu' [-Wmissing-prototypes] It's not referenced anywhere else, make it static. Signed-off-by: Valdis Kletnieks Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/28974.1552377997@turing-police Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 72d06e302e99..dfc4bab0b02b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11878,7 +11878,7 @@ static void __init perf_event_init_all_cpus(void) } } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); -- cgit v1.2.3 From 6455959819bf2469190ae9f6b4ccebaa9827e884 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Feb 2019 14:38:37 +0100 Subject: ia64/tlb: Eradicate tlb_migrate_finish() callback Only ia64-sn2 uses this as an optimization, and there it is of questionable correctness due to the mm_users==1 test. Remove it entirely. No change in behavior intended. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4778c48a7fda..ade3f2287d1f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1151,7 +1151,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); return 0; } else if (task_on_rq_queued(p)) { /* -- cgit v1.2.3 From 4a6c91fbdef846ec7250b82f2eeeb87ac5f18cf9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Mar 2019 11:09:13 +0100 Subject: x86/uaccess, ftrace: Fix ftrace_likely_update() vs. SMAP For CONFIG_TRACE_BRANCH_PROFILING=y the likely/unlikely things get overloaded and generate callouts to this code, and thus also when AC=1. Make it safe. 
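For reference, the pattern applied below is the usual one for code that can be entered with AC set: save and disable the user-access (SMAP) state around the instrumentation body, then restore it afterwards. A minimal sketch of the pattern, using the same x86 uaccess helpers the patch uses:

	unsigned long flags = user_access_save();	/* saves EFLAGS.AC and clears it */

	/* ... instrumentation work that must not run with user access enabled ... */

	user_access_restore(flags);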
Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant) { + unsigned long flags = user_access_save(); + /* A constant is always correct */ if (is_constant) { f->constant++; @@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, f->data.correct++; else f->data.incorrect++; + + user_access_restore(flags); } EXPORT_SYMBOL(ftrace_likely_update); -- cgit v1.2.3 From 40ea97290b08be2e038b31cbb33097d1145e8169 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Mar 2019 19:54:25 +0100 Subject: x86/uaccess, kcov: Disable stack protector New tooling noticed this mishap: kernel/kcov.o: warning: objtool: write_comp_data()+0x138: call to __stack_chk_fail() with UACCESS enabled kernel/kcov.o: warning: objtool: __sanitizer_cov_trace_pc()+0xd9: call to __stack_chk_fail() with UACCESS enabled All the other instrumentation (KASAN,UBSAN) also have stack protector disabled. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..62471e75a2b0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) -- cgit v1.2.3 From a1247d06d01045d7ab2882a9c074fbf21137c690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Mar 2019 13:18:56 +0100 Subject: locking/static_key: Fix false positive warnings on concurrent dec/inc Even though the atomic_dec_and_mutex_lock() in __static_key_slow_dec_cpuslocked() can never see a negative value in key->enabled the subsequent sanity check is re-reading key->enabled, which may have been set to -1 in the meantime by static_key_slow_inc_cpuslocked(). CPU A CPU B __static_key_slow_dec_cpuslocked(): static_key_slow_inc_cpuslocked(): # enabled = 1 atomic_dec_and_mutex_lock() # enabled = 0 atomic_read() == 0 atomic_set(-1) # enabled = -1 val = atomic_read() # Oops - val == -1! The test case is TCP's clean_acked_data_enable() / clean_acked_data_disable() as tickled by KTLS (net/ktls). Suggested-by: Jakub Kicinski Reported-by: Jakub Kicinski Tested-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: pbonzini@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..a799b1ac6b2f 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -206,6 +206,8 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + int val; + lockdep_assert_cpus_held(); /* @@ -215,17 +217,20 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. */ - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { - WARN(atomic_read(&key->enabled) < 0, - "jump label: negative count!\n"); + val = atomic_fetch_add_unless(&key->enabled, -1, 1); + if (val != 1) { + WARN(val < 0, "jump label: negative count!\n"); return; } - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else { - jump_label_update(key); + jump_label_lock(); + if (atomic_dec_and_test(&key->enabled)) { + if (rate_limit) { + atomic_inc(&key->enabled); + schedule_delayed_work(work, rate_limit); + } else { + jump_label_update(key); + } } jump_label_unlock(); } -- cgit v1.2.3 From b10abd0a8859493a93c6b8020f2be2587557749d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:23 -0400 Subject: sched/cpufreq: Annotate cpufreq_update_util_data pointer with __rcu Recently I added an RCU annotation check to rcu_assign_pointer(). All pointers assigned to RCU protected data are to be annotated with __rcu in order to be able to use rcu_assign_pointer() similar to checks in other RCU APIs. This resulted in a sparse error: kernel//sched/cpufreq.c:41:9: sparse: error: incompatible types in comparison expression (different address spaces) Fix this by annotating cpufreq_update_util_data pointer with __rcu. This will also help sparse catch any future RCU misuse bugs. Signed-off-by: Joel Fernandes (Google) [ From an RCU perspective. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-2-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 835671f0f917..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@ */ #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efa686eeff26..713715dd00cf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu) #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. -- cgit v1.2.3 From 994aeb7a93e43d28f6074195ccb03a384342e1bf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:24 -0400 Subject: sched_domain: Annotate RCU pointers properly The scheduler uses RCU API in various places to access sched_domain pointers. These cause sparse errors as below. Many new errors show up because of an annotation check I added to rcu_assign_pointer(). Let us annotate the pointers correctly which also will help sparse catch any potential future bugs. This fixes the following sparse errors: rt.c:1681:9: error: incompatible types in comparison expression deadline.c:1904:9: error: incompatible types in comparison expression core.c:519:9: error: incompatible types in comparison expression core.c:1634:17: error: incompatible types in comparison expression fair.c:6193:14: error: incompatible types in comparison expression fair.c:9883:22: error: incompatible types in comparison expression fair.c:9897:9: error: incompatible types in comparison expression sched.h:1287:9: error: incompatible types in comparison expression topology.c:612:9: error: incompatible types in comparison expression topology.c:615:9: error: incompatible types in comparison expression sched.h:1300:9: error: incompatible types in comparison expression topology.c:618:9: error: incompatible types in comparison expression sched.h:1287:9: error: incompatible types in comparison expression topology.c:621:9: error: incompatible types in comparison expression sched.h:1300:9: error: incompatible types in comparison expression topology.c:624:9: error: incompatible types in comparison expression topology.c:671:9: error: incompatible types in comparison expression stats.c:45:17: error: incompatible types in comparison expression fair.c:5998:15: error: incompatible types in comparison expression fair.c:5989:15: error: incompatible types in comparison expression fair.c:5998:15: error: incompatible types in comparison expression fair.c:5989:15: error: incompatible types in comparison expression fair.c:6120:19: error: incompatible types in comparison expression fair.c:6506:14: error: incompatible types in comparison expression fair.c:6515:14: error: incompatible types in comparison expression fair.c:6623:9: error: incompatible types in comparison expression fair.c:5970:17: error: incompatible types in comparison expression fair.c:8642:21: error: incompatible types in comparison expression fair.c:9253:9: error: incompatible types in comparison expression fair.c:9331:9: error: incompatible types in comparison expression fair.c:9519:15: error: incompatible types in comparison expression fair.c:9533:14: error: incompatible types in comparison expression fair.c:9542:14: error: incompatible types in comparison expression fair.c:9567:14: error: incompatible types in comparison expression fair.c:9597:14: error: incompatible types in comparison expression fair.c:9421:16: error: incompatible types in comparison expression fair.c:9421:16: error: incompatible types in comparison expression Signed-off-by: Joel 
Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) [ From an RCU perspective. ] Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-3-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 14 +++++++------- kernel/sched/topology.c | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 713715dd00cf..2b452d68ab2e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -869,8 +869,8 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain __rcu *sd; unsigned long cpu_capacity; unsigned long cpu_capacity_orig; @@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) return sd; } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ab7f371a3a17..64bec54ded3e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd) * the cpumask of the domain), this allows us to quickly tell if * two CPUs are in the same cache domain, see cpus_share_cache(). */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) -- cgit v1.2.3 From 7ba7319f9e3898101bff5d63cbae5a6cc174c8c9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:26 -0400 Subject: sched/core: Annotate perf_domain pointer with __rcu This fixes the following sparse errors in sched/fair.c: fair.c:6506:14: error: incompatible types in comparison expression fair.c:8642:21: error: incompatible types in comparison expression Using __rcu will also help sparse catch any future bugs. 
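For reference, a minimal sketch of how an __rcu annotated pointer like this is meant to be accessed (generic RCU API usage, not code taken from this patch): readers dereference it inside an RCU read-side critical section, and updaters publish a new value with rcu_assign_pointer():

	struct perf_domain *pd;

	/* reader */
	rcu_read_lock();
	pd = rcu_dereference(rd->pd);
	if (pd) {
		/* ... use *pd ... */
	}
	rcu_read_unlock();

	/* updater, serialized by the appropriate lock */
	rcu_assign_pointer(rd->pd, new_pd);

With the field annotated __rcu, sparse can flag any access that bypasses these helpers.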
Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) [ From an RCU perspective. ] Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-5-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b452d68ab2e..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -780,7 +780,7 @@ struct root_domain { * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. */ - struct perf_domain *pd; + struct perf_domain __rcu *pd; }; extern struct root_domain def_root_domain; -- cgit v1.2.3 From 71b47eaf6fb29b7f9722dc1646c26eb8a96e0a6d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:38:39 +0800 Subject: sched/fair: Make sync_entity_load_avg() and remove_entity_load_avg() static Fix these sparse warnings: kernel/sched/fair.c:3570:6: warning: symbol 'sync_entity_load_avg' was not declared. Should it be static? kernel/sched/fair.c:3583:6: warning: symbol 'remove_entity_load_avg' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190320133839.21392-1-yuehaibing@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40bd1e27b1b7..ed7f5f8107b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3567,7 +3567,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) * Synchronize entity load avg of dequeued entity without locking * the previous rq. */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; @@ -3580,7 +3580,7 @@ static void sync_entity_load_avg(struct sched_entity *se) * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; -- cgit v1.2.3 From 46ad0840b1584b92b5ff2cc3ed0b011dd6b8e0f1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:06 -0400 Subject: locking/rwsem: Remove arch specific rwsem files As the generic rwsem-xadd code is using the appropriate acquire and release versions of the atomic operations, the arch specific rwsem.h files will not be that much faster than the generic code as long as the atomic functions are properly implemented. So we can remove those arch specific rwsem.h and stop building asm/rwsem.h to reduce maintenance effort. Currently, only x86, alpha and ia64 have implemented architecture specific fast paths. I don't have access to alpha and ia64 systems for testing, but they are legacy systems that are not likely to be updated to the latest kernel anyway.
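As a reminder of how the generic fast path works (a rough summary of the xadd scheme, not code from this patch): sem->count packs the active reader/writer count into the low bits and a waiter bias into the upper bits, using the RWSEM_*_BIAS constants that the patch below moves into kernel/locking/rwsem.h:

	down_read:  count += RWSEM_ACTIVE_READ_BIAS;   /* result > 0: granted; result <= 0: writer active or waiting, slow path */
	down_write: count += RWSEM_ACTIVE_WRITE_BIAS;  /* result == RWSEM_ACTIVE_WRITE_BIAS: granted; anything else: slow path */
	up_read:    count -= RWSEM_ACTIVE_BIAS;        /* last active count gone with waiters queued: wake them */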
By using a rwsem microbenchmark, the total locking rates on a 4-socket 56-core 112-thread x86-64 system before and after the patch were as follows (mixed means equal # of read and write locks): Before Patch After Patch # of Threads wlock rlock mixed wlock rlock mixed ------------ ----- ----- ----- ----- ----- ----- 1 29,201 30,143 29,458 28,615 30,172 29,201 2 6,807 13,299 1,171 7,725 15,025 1,804 4 6,504 12,755 1,520 7,127 14,286 1,345 8 6,762 13,412 764 6,826 13,652 726 16 6,693 15,408 662 6,599 15,938 626 32 6,145 15,286 496 5,549 15,487 511 64 5,812 15,495 60 5,858 15,572 60 There were some run-to-run variations for the multi-thread tests. For x86-64, using the generic C code fast path seems to be a little bit faster than the assembly version with low lock contention. Looking at the assembly version of the fast paths, there are assembly to/from C code wrappers that save and restore all the callee-clobbered registers (7 registers on x86-64). The assembly generated from the generic C code doesn't need to do that. That may explain the slight performance gain here. The generic asm rwsem.h can also be merged into kernel/locking/rwsem.h with no code change as no other code other than those under kernel/locking needs to access the internal rwsem macros and functions. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/percpu-rwsem.c | 2 + kernel/locking/rwsem.h | 130 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@ #include #include +#include "rwsem.h" + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..067e265fa5c1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -32,6 +32,26 @@ # define DEBUG_RWSEMS_WARN_ON(c) #endif +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras . 
+ */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +152,113 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } #endif + +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) + rwsem_down_read_failed(sem); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + } + + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + long tmp; + + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + return 1; + } + } + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + return tmp == RWSEM_UNLOCKED_VALUE; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_dec_return_release(&sem->count); + if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count) < 0)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. 
+ */ + tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} + +#endif /* CONFIG_RWSEM_XCHGADD_ALGORITHM */ -- cgit v1.2.3 From 390a0c62c23cb026cd4664a66f6f45fed3a215f6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:07 -0400 Subject: locking/rwsem: Remove rwsem-spinlock.c & use rwsem-xadd.c for all archs Currently, we have two different implementations of rwsem: 1) CONFIG_RWSEM_GENERIC_SPINLOCK (rwsem-spinlock.c) 2) CONFIG_RWSEM_XCHGADD_ALGORITHM (rwsem-xadd.c) As we are going to use a single generic implementation for rwsem-xadd.c and no architecture-specific code will be needed, there is no point in keeping two different implementations of rwsem. In most cases, the performance of rwsem-spinlock.c will be worse. It also doesn't get all the performance tuning and optimizations that had been implemented in rwsem-xadd.c over the years. For simplification, we are going to remove rwsem-spinlock.c and make all architectures use a single implementation of rwsem - rwsem-xadd.c. All references to RWSEM_GENERIC_SPINLOCK and RWSEM_XCHGADD_ALGORITHM in the code are removed. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-3-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 2 +- kernel/locking/Makefile | 4 +- kernel/locking/rwsem-spinlock.c | 339 ---------------------------------------- kernel/locking/rwsem.h | 3 - 4 files changed, 2 insertions(+), 346 deletions(-) delete mode 100644 kernel/locking/rwsem-spinlock.c (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..e335953fa704 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER config RWSEM_SPIN_ON_OWNER def_bool y - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config LOCK_SPIN_ON_OWNER def_bool y diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..1af83e9ce57d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,6 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli - * - Derived also from comments by Linus - */ -#include -#include -#include -#include - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). 
- */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: - return 0; - -out_nolock: - /* - * We didn't take the lock, so that there is a writer, which - * is owner or the first waiter of the sem. If it's a waiter, - * it will be woken by current owner. Not need to wake anybody. - */ - list_del(&waiter.list); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ - __down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ - return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
- */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 067e265fa5c1..45ee00236e03 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -153,7 +153,6 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM /* * lock for reading */ @@ -260,5 +259,3 @@ static inline void __downgrade_write(struct rw_semaphore *sem) if (tmp < 0) rwsem_downgrade_wake(sem); } - -#endif /* CONFIG_RWSEM_XCHGADD_ALGORITHM */ -- cgit v1.2.3 From ddb20d1d3aed8f130519c0a29cd5392efcc067b8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:08 -0400 Subject: locking/rwsem: Optimize down_read_trylock() Modify __down_read_trylock() to optimize for an unlocked rwsem and make it generate slightly better code. 
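The key primitive here is atomic_long_try_cmpxchg_acquire(): unlike the cmpxchg() loop it replaces, try_cmpxchg() returns a bool and, on failure, updates the expected-value argument with the value it actually found, so the loop can start from an optimistic RWSEM_UNLOCKED_VALUE and never needs a separate reload of sem->count. A generic sketch of the idiom (illustration only, not the patched function, which is shown in the diff below):

	static inline int read_trylock_sketch(atomic_long_t *count)
	{
		long old = 0;	/* optimistic guess: lock is free */

		do {
			if (atomic_long_try_cmpxchg_acquire(count, &old, old + 1))
				return 1;	/* 'old' matched; lock acquired */
			/* on failure, 'old' now holds the current value of *count */
		} while (old >= 0);	/* keep trying while no writer is present */

		return 0;
	}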
Before this patch, down_read_trylock: 0x0000000000000000 <+0>: callq 0x5 0x0000000000000005 <+5>: jmp 0x18 0x0000000000000007 <+7>: lea 0x1(%rdx),%rcx 0x000000000000000b <+11>: mov %rdx,%rax 0x000000000000000e <+14>: lock cmpxchg %rcx,(%rdi) 0x0000000000000013 <+19>: cmp %rax,%rdx 0x0000000000000016 <+22>: je 0x23 0x0000000000000018 <+24>: mov (%rdi),%rdx 0x000000000000001b <+27>: test %rdx,%rdx 0x000000000000001e <+30>: jns 0x7 0x0000000000000020 <+32>: xor %eax,%eax 0x0000000000000022 <+34>: retq 0x0000000000000023 <+35>: mov %gs:0x0,%rax 0x000000000000002c <+44>: or $0x3,%rax 0x0000000000000030 <+48>: mov %rax,0x20(%rdi) 0x0000000000000034 <+52>: mov $0x1,%eax 0x0000000000000039 <+57>: retq After patch, down_read_trylock: 0x0000000000000000 <+0>: callq 0x5 0x0000000000000005 <+5>: xor %eax,%eax 0x0000000000000007 <+7>: lea 0x1(%rax),%rdx 0x000000000000000b <+11>: lock cmpxchg %rdx,(%rdi) 0x0000000000000010 <+16>: jne 0x29 0x0000000000000012 <+18>: mov %gs:0x0,%rax 0x000000000000001b <+27>: or $0x3,%rax 0x000000000000001f <+31>: mov %rax,0x20(%rdi) 0x0000000000000023 <+35>: mov $0x1,%eax 0x0000000000000028 <+40>: retq 0x0000000000000029 <+41>: test %rax,%rax 0x000000000000002c <+44>: jns 0x7 0x000000000000002e <+46>: xor %eax,%eax 0x0000000000000030 <+48>: retq By using a rwsem microbenchmark, the down_read_trylock() rate (with a load of 10 to lengthen the lock critical section) on an x86-64 system before and after the patch were: Before Patch After Patch # of Threads rlock rlock ------------ ----- ----- 1 14,496 14,716 2 8,644 8,453 4 6,799 6,983 8 5,664 7,190 On an ARM64 system, the performance results were: Before Patch After Patch # of Threads rlock rlock ------------ ----- ----- 1 23,676 24,488 2 7,697 9,502 4 4,945 3,440 8 2,641 1,603 For the uncontended case (1 thread), the new down_read_trylock() is a little bit faster. For the contended cases, the new down_read_trylock() performs pretty well on x86-64, but performance degrades at high contention levels on ARM64. Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-4-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 45ee00236e03..1f5775aa6a1d 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -174,14 +174,17 @@ static inline int __down_read_killable(struct rw_semaphore *sem) static inline int __down_read_trylock(struct rw_semaphore *sem) { - long tmp; + /* + * Optimize for the case when the rwsem is not locked at all.
+ */ + long tmp = RWSEM_UNLOCKED_VALUE; - while ((tmp = atomic_long_read(&sem->count)) >= 0) { - if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { return 1; } - } + } while (tmp >= 0); return 0; } -- cgit v1.2.3 From 06ee7115b0d1742de745ad143fb5e06d77d27fba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: bpf: add verifier stats and log_level bit 2 In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87221fda1321..e2001c1e40b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1092,7 +1092,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1139,6 +1139,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1155,7 +1156,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } @@ -1455,7 +1460,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. 
*/ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -2938,7 +2943,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -2978,7 +2983,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -5001,7 +5006,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -6181,6 +6186,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; @@ -6194,6 +6202,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6278,8 +6288,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; env->prev_linfo = NULL; @@ -6314,10 +6323,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. 
Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -6326,7 +6335,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -6344,8 +6353,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -6356,7 +6366,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -6621,16 +6631,6 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -7854,9 +7854,34 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; int i, len, ret = -EINVAL; @@ -7899,7 +7924,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -7980,6 +8005,9 @@ skip_full_check: if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { -- cgit v1.2.3 From 9f4686c41bdff051f557accb531af79dd1773687 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: bpf: improve verification speed by droping states Branch instructions, branch targets and calls in a bpf program are the places where the verifier remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. 
For unprivileged programs there is a limit of 64 states per such 'branching' instruction (the maximum list length is tracked by the max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases the insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit, and cilium programs can have a max_states_per_insn of 100 or higher. Walking 100+ states multiplied by the number of 'branching' insns during verification consumes a significant amount of CPU time. It turned out that a simple LRU-like mechanism can be used to remove states that are unlikely to be helpful in future search pruning.

This patch introduces hit_cnt and miss_cnt counters:
  hit_cnt - this many times this state successfully pruned the search
  miss_cnt - this many times this state was not equivalent to other states (and those other states were added to the state list)

The heuristic introduced in this patch is:
  if (sl->miss_cnt > sl->hit_cnt * 3 + 3)
	/* drop this state from future considerations */

Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce the insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (The analysis was done on selftests/progs/* and on cilium programs.)

The end result is that this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second or more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, it's now ~10.

There is a slight increase in insn_processed for cilium progs:

                      before  after
 bpf_lb-DLB_L3.o        1831   1838
 bpf_lb-DLB_L4.o        3029   3218
 bpf_lb-DUNKNOWN.o      1064   1064
 bpf_lxc-DDROP_ALL.o   26309  26935
 bpf_lxc-DUNKNOWN.o    33517  34439
 bpf_netdev.o           9713   9721
 bpf_overlay.o          6184   6184
 bpf_lcx_jit.o         37335  39389

And a 2-3 times improvement in verification speed.

Signed-off-by: Alexei Starovoitov
Reviewed-by: Jakub Kicinski
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e2001c1e40b3..a636db4a7a4e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6152,11 +6152,13 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
-	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
 
-	sl = env->explored_states[insn_idx];
+	pprev = &env->explored_states[insn_idx];
+	sl = *pprev;
+
 	if (!sl)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
 		 * be doing state search here
 		 */
@@ -6167,6 +6169,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 
 	while (sl != STATE_LIST_MARK) {
 		if (states_equal(env, &sl->state, cur)) {
+			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
 			 * prune the search.
 			 * Registers read by the continuation are read by us.
@@ -6182,8 +6185,35 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl = sl->next; states_cnt++; + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful. Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } + pprev = &sl->next; + sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -7836,6 +7866,14 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; -- cgit v1.2.3 From 25af32dad8047d180e70e233c85b909dd6587cc5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:42 -0700 Subject: bpf: improve verification speed by not remarking live_read With large verifier speed improvement brought by the previous patch mark_reg_read() becomes the hottest function during verification. On a typical program it consumes 40% of cpu. mark_reg_read() walks parentage chain of registers to mark parents as LIVE_READ. Once the register is marked there is no need to remark it again in the future. Hence stop walking the chain once first LIVE_READ is seen. This optimization drops mark_reg_read() time from 40% of cpu to <1% and overall 2x improvement of verification speed. For some programs the longest_mark_read_walk counter improves from ~500 to ~5 Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a636db4a7a4e..94cf6efc5df6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1151,6 +1151,15 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + if (parent->live & REG_LIVE_READ) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + */ + break; /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; -- cgit v1.2.3 From 71dde681a8cea1ccff2c7b3be83c043ab6b2a977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:43 -0700 Subject: bpf: convert temp arrays to kvcalloc Temporary arrays used during program verification need to be vmalloc-ed to support large bpf programs. 
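[Editor's note: a usage sketch, assuming kernel context; the helper names are illustrative. kvcalloc() uses kmalloc for small requests and transparently falls back to vmalloc for large ones, and kvfree() releases memory obtained from either allocator, which is why the call sites in the diff below only need the s/kcalloc/kvcalloc/ substitution.]

#include <linux/mm.h>	/* kvcalloc(), kvfree() */

static int *alloc_insn_state(u32 insn_cnt)
{
	/* may be kmalloc- or vmalloc-backed depending on insn_cnt */
	return kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
}

static void free_insn_state(int *insn_state)
{
	kvfree(insn_state);	/* correct for either backing allocator */
}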
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94cf6efc5df6..ad3494a881da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5313,13 +5313,13 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } @@ -5417,8 +5417,8 @@ check_state: ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); return ret; } @@ -7898,7 +7898,7 @@ static void free_states(struct bpf_verifier_env *env) } } - kfree(env->explored_states); + kvfree(env->explored_states); } static void print_verification_stats(struct bpf_verifier_env *env) @@ -7994,7 +7994,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- cgit v1.2.3 From 4f73379ec5c2891598aa715c6df7ac9afdc86fbf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:44 -0700 Subject: bpf: verbose jump offset overflow check Larger programs may trigger 16-bit jump offset overflow check during instruction patching. Make this error verbose otherwise users cannot decipher error code without printks in the verifier. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 11 ++++++----- kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..2966cb368bf4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -438,6 +438,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -453,8 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, off + 1, off + len, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -463,7 +464,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -1096,13 +1097,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. 
*/ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ad3494a881da..6dcfeb44bb8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6932,8 +6932,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; + } if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; adjust_subprog_starts(env, off, len); -- cgit v1.2.3 From c04c0d2b968ac45d6ef020316808ef6c82325a82 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:45 -0700 Subject: bpf: increase complexity limit and maximum program size Large verifier speed improvements allow to increase verifier complexity limit. Now regardless of the program composition and its size it takes little time for the verifier to hit insn_processed limit. On typical x86 machine non-debug kernel processes 1M instructions in 1/10 of a second. (before these speed improvements specially crafted programs could be hitting multi-second verification times) Full kasan kernel with debug takes ~1 second for the same 1M insns. Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M. Also increase the number of instructions per program from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit. 4k limit was confusing to users, since small programs with hundreds of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit. Sometimes adding more insns and bpf_trace_printk debug statements would make the verifier accept the program while removing code would make the verifier reject it. Some user space application started to add #define MAX_FOO to their programs and do: MAX_FOO=100; again: compile with MAX_FOO; try to load; if (fails_to_load) { reduce MAX_FOO; goto again; } to be able to fit maximum amount of processing into single program. Other users artificially split their single program into a set of programs and use all 32 iterations of tail_calls to increase compute limits. And the most advanced folks used unlimited tc-bpf filter list to execute many bpf programs. Essentially the users managed to workaround 4k insn limit. This patch removes the limit for root programs from uapi. BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit and success to load the program no longer depends on program size, but on 'smartness' of the verifier only. The verifier will continue to get smarter with every kernel release. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index afca36f53c49..1d65e56594db 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1557,7 +1557,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + if (attr->insn_cnt == 0 || + attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? 
			     BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
 		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6dcfeb44bb8e..b631e89e7a51 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -176,7 +176,6 @@ struct bpf_verifier_stack_elem {
 	struct bpf_verifier_stack_elem *next;
 };
 
-#define BPF_COMPLEXITY_LIMIT_INSNS	131072
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 #define BPF_COMPLEXITY_LIMIT_STATES	64
-- cgit v1.2.3

From 7a9f5c65abcc9644b11738ca0815510cb5510eaf Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 1 Apr 2019 21:27:46 -0700
Subject: bpf: increase verifier log limit

The existing 16 Mbyte verifier log limit is not enough for log_level=2 even for small programs. Increase it to 1 Gbyte. Note that this is not a kernel memory limit; it is the amount of memory user space provides to store the verifier log. The kernel populates it 1k at a time.

Signed-off-by: Alexei Starovoitov
Reviewed-by: Jakub Kicinski
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b631e89e7a51..bb27b675923c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7974,7 +7974,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 
 	ret = -EINVAL;
 	/* log attributes have to be sane */
-	if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+	if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
 	    !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
 		goto err_unlock;
 	}
-- cgit v1.2.3

From d6e486ee0ef2f99a4069d9186e53dac61b28cb3c Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Wed, 3 Apr 2019 16:03:54 -0700
Subject: cgroup: remove extra cgroup_migrate_finish() call

The callers of cgroup_migrate_prepare_dst() correctly call cgroup_migrate_finish() in both the success and failure cases. There is no need to call it in cgroup_migrate_prepare_dst() itself in the failure case.

Signed-off-by: Shakeel Butt
Reviewed-by: Daniel Jordan
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3f2b4bde0f9c..f219c195a9a5 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2602,7 +2602,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
 		if (!dst_cset)
-			goto err;
+			return -ENOMEM;
 
 		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
 
@@ -2634,9 +2634,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 	}
 
 	return 0;
-err:
-	cgroup_migrate_finish(mgctx);
-	return -ENOMEM;
 }
 
 /**
-- cgit v1.2.3

From 9419a3191dcb27f24478d288abaab697228d28e6 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 4 Apr 2019 21:04:13 -0400
Subject: acct_on(): don't mess with freeze protection

What happens here is that we are replacing file->path.mnt of a file we'd just opened with a clone, and we need the write count contribution to be transferred from the original mount to the new one. That's it. We do *NOT* want any kind of freeze protection for the duration of the switchover. IOW, we should just use __mnt_{want,drop}_write() for that switchover; no need to bother with mnt_{want,drop}_write() there.
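[Editor's note: for context, mnt_want_write() is essentially the freeze-protected wrapper around __mnt_want_write(); the sketch below is simplified from fs/namespace.c and is illustrative rather than verbatim. The patch drops the sb_start_write()/sb_end_write() pair and keeps only the per-mount write-count accounting.]

int mnt_want_write(struct vfsmount *mnt)
{
	int ret;

	sb_start_write(mnt->mnt_sb);	/* freeze protection */
	ret = __mnt_want_write(mnt);	/* per-mount write count */
	if (ret)
		sb_end_write(mnt->mnt_sb);
	return ret;
}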
Tested-by: Amir Goldstein Reported-by: syzbot+2a73a6ea9507b7112141@syzkaller.appspotmail.com Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = mnt_want_write(internal); + err = __mnt_want_write(internal); if (err) { mntput(internal); kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); mntput(mnt); return 0; } -- cgit v1.2.3 From f2bcd05ec7b839ff826d2008506ad2d2dff46a59 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:37 -0700 Subject: bpf: Reject indirect var_off stack access in raw mode It's hard to guarantee that whole memory is marked as initialized on helper return if uninitialized stack is accessed with variable offset since specific bounds are unknown to verifier. This may cause uninitialized stack leaking. Reject such an access in check_stack_boundary to prevent possible leaking. There are no known use-cases for indirect uninitialized stack access with variable offset so it shouldn't break anything. Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb27b675923c..0f12fda35626 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2226,6 +2226,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (err) return err; } else { + /* Only initialized buffer on stack is allowed to be accessed + * with variable offset. With uninitialized buffer it's hard to + * guarantee that whole memory is marked as initialized on + * helper return since specific bounds are unknown what may + * cause uninitialized stack leaking. + */ + if (meta && meta->raw_mode) + meta = NULL; + min_off = reg->smin_value + reg->off; max_off = reg->umax_value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, -- cgit v1.2.3 From 088ec26d9c2da9d879ab73e3f4117f9df6c566ee Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:39 -0700 Subject: bpf: Reject indirect var_off stack access in unpriv mode Proper support of indirect stack access with variable offset in unprivileged mode (!root) requires corresponding support in Spectre masking for stack ALU in retrieve_ptr_limit(). There are no use-case for variable offset in unprivileged mode though so make verifier reject such accesses for simplicity. Pointer arithmetics is one (and only?) 
way to cause variable offset and it's already rejected in unpriv mode so that verifier won't even get to helper function whose argument contains variable offset, e.g.: 0: (7a) *(u64 *)(r10 -16) = 0 1: (7a) *(u64 *)(r10 -8) = 0 2: (61) r2 = *(u32 *)(r1 +0) 3: (57) r2 &= 4 4: (17) r2 -= 16 5: (0f) r2 += r10 variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Still it looks like a good idea to reject variable offset indirect stack access for unprivileged mode in check_stack_boundary() explicitly. Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0f12fda35626..8400c1f33cd4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2226,6 +2226,19 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (err) return err; } else { + /* Variable offset is prohibited for unprivileged mode for + * simplicity since it requires corresponding support in + * Spectre masking for stack ALU. + * See also retrieve_ptr_limit(). + */ + if (!env->allow_ptr_leaks) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", + regno, tn_buf); + return -EACCES; + } /* Only initialized buffer on stack is allowed to be accessed * with variable offset. With uninitialized buffer it's hard to * guarantee that whole memory is marked as initialized on @@ -3339,6 +3352,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, switch (ptr_reg->type) { case PTR_TO_STACK: + /* Indirect variable offset stack access is prohibited in + * unprivileged mode so it's not handled here. + */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) *ptr_limit = MAX_BPF_STACK + off; -- cgit v1.2.3 From 107c26a70ca81bfc33657366ad69d02fdc9efc9d Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:41 -0700 Subject: bpf: Sanity check max value for var_off stack access As discussed in [1] max value of variable offset has to be checked for overflow on stack access otherwise verifier would accept code like this: 0: (b7) r2 = 6 1: (b7) r3 = 28 2: (7a) *(u64 *)(r10 -16) = 0 3: (7a) *(u64 *)(r10 -8) = 0 4: (79) r4 = *(u64 *)(r1 +168) 5: (c5) if r4 s< 0x0 goto pc+4 R1=ctx(id=0,off=0,imm=0) R2=inv6 R3=inv28 R4=inv(id=0,umax_value=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0,call_-1 fp-8=mmmmmmmm fp-16=mmmmmmmm 6: (17) r4 -= 16 7: (0f) r4 += r10 8: (b7) r5 = 8 9: (85) call bpf_getsockopt#57 10: (b7) r0 = 0 11: (95) exit , where R4 obviosly has unbounded max value. Fix it by checking that reg->smax_value is inside (-BPF_MAX_VAR_OFF; BPF_MAX_VAR_OFF) range. reg->smax_value is used instead of reg->umax_value because stack pointers are calculated using negative offset from fp. This is opposite to e.g. map access where offset must be non-negative and where umax_value is used. Also dedicated verbose logs are added for both min and max bound check failures to have diagnostics consistent with variable offset handling in check_map_access(). 
[1] https://marc.info/?l=linux-netdev&m=155433357510597&w=2 Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8400c1f33cd4..f2d600199e66 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2248,16 +2248,28 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smax_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "R%d unbounded indirect variable offset stack access\n", + regno); + return -EACCES; + } min_off = reg->smin_value + reg->off; - max_off = reg->umax_value + reg->off; + max_off = reg->smax_value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, zero_size_allowed); - if (err) + if (err) { + verbose(env, "R%d min value is outside of stack bound\n", + regno); return err; + } err = __check_stack_boundary(env, regno, max_off, access_size, zero_size_allowed); - if (err) + if (err) { + verbose(env, "R%d max value is outside of stack bound\n", + regno); return err; + } } if (meta && meta->raw_mode) { -- cgit v1.2.3 From 1fbd20f8b77b366ea4aeb92ade72daa7f36a7e3b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:43 -0700 Subject: bpf: Add missed newline in verifier verbose log check_stack_access() that prints verbose log is used in adjust_ptr_min_max_vals() that prints its own verbose log and now they stick together, e.g.: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Add missing newline so that log is more readable: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1 R2 stack pointer arithmetic goes out of range, prohibited for !root Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f2d600199e66..48718e1da16d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1426,7 +1426,7 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } -- cgit v1.2.3 From bfe83844987a52dc1f71f757b60523811502dc93 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 28 Mar 2019 16:13:35 +0100 Subject: genirq/timings: Remove variance computation code The variance computation did not provide the expected results and will be replaced with a different approach to compute the next interrupt based on the array suffixes derived algorithm. There is no good way to transform the variance code to the new algorithm, so for ease of review remove the existing code first. 
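[Editor's note: for reference, the statistical filter being removed reduces to the check below — a simplified sketch with illustrative names; the full derivation lives in the deleted comments that follow. A sample is accepted as "normal" when it lies within avg +/- 3*stddev; squaring both sides avoids the square root, giving (value - avg)^2 < 9 * variance.]

static bool interval_is_anomaly(u64 interval, u32 avg, u64 variance)
{
	s64 diff = (s64)interval - avg;

	/* outside avg +/- 3*stddev, tested without computing a sqrt */
	return (u64)(diff * diff) > 9 * variance;
}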
Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: rjw@rjwysocki.net Cc: ulf.hansson@linaro.org Cc: linux-pm@vger.kernel.org Link: https://lkml.kernel.org/r/20190328151336.5316-1-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 252 +-------------------------------------------------- 1 file changed, 2 insertions(+), 250 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..3cde046a2bc8 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -8,7 +8,6 @@ #include #include #include -#include #include @@ -19,13 +18,7 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); struct irqt_stat { - u64 next_evt; - u64 last_ts; - u64 variance; - u32 avg; - u32 nr_samples; - int anomalies; - int valid; + u64 next_evt; }; static DEFINE_IDR(irqt_stats); @@ -40,184 +33,6 @@ void irq_timings_disable(void) static_branch_disable(&irq_timing_enabled); } -/** - * irqs_update - update the irq timing statistics with a new timestamp - * - * @irqs: an irqt_stat struct pointer - * @ts: the new timestamp - * - * The statistics are computed online, in other words, the code is - * designed to compute the statistics on a stream of values rather - * than doing multiple passes on the values to compute the average, - * then the variance. The integer division introduces a loss of - * precision but with an acceptable error margin regarding the results - * we would have with the double floating precision: we are dealing - * with nanosec, so big numbers, consequently the mantisse is - * negligeable, especially when converting the time in usec - * afterwards. - * - * The computation happens at idle time. When the CPU is not idle, the - * interrupts' timestamps are stored in the circular buffer, when the - * CPU goes idle and this routine is called, all the buffer's values - * are injected in the statistical model continuying to extend the - * statistics from the previous busy-idle cycle. - * - * The observations showed a device will trigger a burst of periodic - * interrupts followed by one or two peaks of longer time, for - * instance when a SD card device flushes its cache, then the periodic - * intervals occur again. A one second inactivity period resets the - * stats, that gives us the certitude the statistical values won't - * exceed 1x10^9, thus the computation won't overflow. - * - * Basically, the purpose of the algorithm is to watch the periodic - * interrupts and eliminate the peaks. - * - * An interrupt is considered periodically stable if the interval of - * its occurences follow the normal distribution, thus the values - * comply with: - * - * avg - 3 x stddev < value < avg + 3 x stddev - * - * Which can be simplified to: - * - * -3 x stddev < value - avg < 3 x stddev - * - * abs(value - avg) < 3 x stddev - * - * In order to save a costly square root computation, we use the - * variance. For the record, stddev = sqrt(variance). The equation - * above becomes: - * - * abs(value - avg) < 3 x sqrt(variance) - * - * And finally we square it: - * - * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 - * - * (value - avg) x (value - avg) < 9 x variance - * - * Statistically speaking, any values out of this interval is - * considered as an anomaly and is discarded. However, a normal - * distribution appears when the number of samples is 30 (it is the - * rule of thumb in statistics, cf. "30 samples" on Internet). When - * there are three consecutive anomalies, the statistics are resetted. 
- * - */ -static void irqs_update(struct irqt_stat *irqs, u64 ts) -{ - u64 old_ts = irqs->last_ts; - u64 variance = 0; - u64 interval; - s64 diff; - - /* - * The timestamps are absolute time values, we need to compute - * the timing interval between two interrupts. - */ - irqs->last_ts = ts; - - /* - * The interval type is u64 in order to deal with the same - * type in our computation, that prevent mindfuck issues with - * overflow, sign and division. - */ - interval = ts - old_ts; - - /* - * The interrupt triggered more than one second apart, that - * ends the sequence as predictible for our purpose. In this - * case, assume we have the beginning of a sequence and the - * timestamp is the first value. As it is impossible to - * predict anything at this point, return. - * - * Note the first timestamp of the sequence will always fall - * in this test because the old_ts is zero. That is what we - * want as we need another timestamp to compute an interval. - */ - if (interval >= NSEC_PER_SEC) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; - } - - /* - * Pre-compute the delta with the average as the result is - * used several times in this function. - */ - diff = interval - irqs->avg; - - /* - * Increment the number of samples. - */ - irqs->nr_samples++; - - /* - * Online variance divided by the number of elements if there - * is more than one sample. Normally the formula is division - * by nr_samples - 1 but we assume the number of element will be - * more than 32 and dividing by 32 instead of 31 is enough - * precise. - */ - if (likely(irqs->nr_samples > 1)) - variance = irqs->variance >> IRQ_TIMINGS_SHIFT; - - /* - * The rule of thumb in statistics for the normal distribution - * is having at least 30 samples in order to have the model to - * apply. Values outside the interval are considered as an - * anomaly. - */ - if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { - /* - * After three consecutive anomalies, we reset the - * stats as it is no longer stable enough. - */ - if (irqs->anomalies++ >= 3) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; - } - } else { - /* - * The anomalies must be consecutives, so at this - * point, we reset the anomalies counter. - */ - irqs->anomalies = 0; - } - - /* - * The interrupt is considered stable enough to try to predict - * the next event on it. - */ - irqs->valid = 1; - - /* - * Online average algorithm: - * - * new_average = average + ((value - average) / count) - * - * The variance computation depends on the new average - * to be computed here first. 
- * - */ - irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); - - /* - * Online variance algorithm: - * - * new_variance = variance + (value - average) x (value - new_average) - * - * Warning: irqs->avg is updated with the line above, hence - * 'interval - irqs->avg' is no longer equal to 'diff' - */ - irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); - - /* - * Update the next event - */ - irqs->next_evt = ts + irqs->avg; -} - /** * irq_timings_next_event - Return when the next event is supposed to arrive * @@ -246,12 +61,6 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts) */ u64 irq_timings_next_event(u64 now) { - struct irq_timings *irqts = this_cpu_ptr(&irq_timings); - struct irqt_stat *irqs; - struct irqt_stat __percpu *s; - u64 ts, next_evt = U64_MAX; - int i, irq = 0; - /* * This function must be called with the local irq disabled in * order to prevent the timings circular buffer to be updated @@ -259,64 +68,7 @@ u64 irq_timings_next_event(u64 now) */ lockdep_assert_irqs_disabled(); - /* - * Number of elements in the circular buffer: If it happens it - * was flushed before, then the number of elements could be - * smaller than IRQ_TIMINGS_SIZE, so the count is used, - * otherwise the array size is used as we wrapped. The index - * begins from zero when we did not wrap. That could be done - * in a nicer way with the proper circular array structure - * type but with the cost of extra computation in the - * interrupt handler hot path. We choose efficiency. - * - * Inject measured irq/timestamp to the statistical model - * while decrementing the counter because we consume the data - * from our circular buffer. - */ - for (i = irqts->count & IRQ_TIMINGS_MASK, - irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); - irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { - - irq = irq_timing_decode(irqts->values[i], &ts); - - s = idr_find(&irqt_stats, irq); - if (s) { - irqs = this_cpu_ptr(s); - irqs_update(irqs, ts); - } - } - - /* - * Look in the list of interrupts' statistics, the earliest - * next event. - */ - idr_for_each_entry(&irqt_stats, s, i) { - - irqs = this_cpu_ptr(s); - - if (!irqs->valid) - continue; - - if (irqs->next_evt <= now) { - irq = i; - next_evt = now; - - /* - * This interrupt mustn't use in the future - * until new events occur and update the - * statistics. - */ - irqs->valid = 0; - break; - } - - if (irqs->next_evt < next_evt) { - irq = i; - next_evt = irqs->next_evt; - } - } - - return next_evt; + return 0; } void irq_timings_free(int irq) -- cgit v1.2.3 From bbba0e7c5cdadb47a91edea1d5cd0caadbbb016f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 28 Mar 2019 16:13:36 +0100 Subject: genirq/timings: Add array suffix computation code The previous approach based on the variance was discarding values from the timings when they were considered as anomalies as stated by the normal law statistical model. However in the interrupt life, there can be multiple anomalies due to the nature of the device generating the interrupts, and most of the time a repeating pattern can be observed, that is particulary true for network, console, MMC or SSD devices. The variance approach missed the patterns and it was only able to deal with the interrupt coming in regular intervals, thus reducing considerably the scope of what is predictable. In order to find out the repeating patterns, the interrupt intervals are grouped in a ilog2 basis to create a suite of numbers with small amplitude. 
Every group contains an exponential moving average of the values belonging to the group. The array suffix, a data structure used for string searching, data compression, etc ..., is built from the suite of numbers and the suffixes are then searched in this suite. The tests showed the algorithm is able to find all repeating patterns, as well as regular interval in less than 1us on x86-i7. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: rjw@rjwysocki.net Cc: ulf.hansson@linaro.org Cc: linux-pm@vger.kernel.org Link: https://lkml.kernel.org/r/20190328151336.5316-2-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 462 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 457 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 3cde046a2bc8..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include @@ -17,10 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); -struct irqt_stat { - u64 next_evt; -}; - static DEFINE_IDR(irqt_stats); void irq_timings_enable(void) @@ -33,6 +31,410 @@ void irq_timings_disable(void) static_branch_disable(&irq_timing_enabled); } +/* + * The main goal of this algorithm is to predict the next interrupt + * occurrence on the current CPU. + * + * Currently, the interrupt timings are stored in a circular array + * buffer every time there is an interrupt, as a tuple: the interrupt + * number and the associated timestamp when the event occurred . + * + * For every interrupt occurring in a short period of time, we can + * measure the elapsed time between the occurrences for the same + * interrupt and we end up with a suite of intervals. The experience + * showed the interrupts are often coming following a periodic + * pattern. + * + * The objective of the algorithm is to find out this periodic pattern + * in a fastest way and use its period to predict the next irq event. + * + * When the next interrupt event is requested, we are in the situation + * where the interrupts are disabled and the circular buffer + * containing the timings is filled with the events which happened + * after the previous next-interrupt-event request. + * + * At this point, we read the circular buffer and we fill the irq + * related statistics structure. After this step, the circular array + * containing the timings is empty because all the values are + * dispatched in their corresponding buffers. + * + * Now for each interrupt, we can predict the next event by using the + * suffix array, log interval and exponential moving average + * + * 1. Suffix array + * + * Suffix array is an array of all the suffixes of a string. It is + * widely used as a data structure for compression, text search, ... + * For instance for the word 'banana', the suffixes will be: 'banana' + * 'anana' 'nana' 'ana' 'na' 'a' + * + * Usually, the suffix array is sorted but for our purpose it is + * not necessary and won't provide any improvement in the context of + * the solved problem where we clearly define the boundaries of the + * search by a max period and min period. + * + * The suffix array will build a suite of intervals of different + * length and will look for the repetition of each suite. If the suite + * is repeating then we have the period because it is the length of + * the suite whatever its position in the buffer. + * + * 2. 
Log interval + * + * We saw the irq timings allow to compute the interval of the + * occurrences for a specific interrupt. We can reasonibly assume the + * longer is the interval, the higher is the error for the next event + * and we can consider storing those interval values into an array + * where each slot in the array correspond to an interval at the power + * of 2 of the index. For example, index 12 will contain values + * between 2^11 and 2^12. + * + * At the end we have an array of values where at each index defines a + * [2^index - 1, 2 ^ index] interval values allowing to store a large + * number of values inside a small array. + * + * For example, if we have the value 1123, then we store it at + * ilog2(1123) = 10 index value. + * + * Storing those value at the specific index is done by computing an + * exponential moving average for this specific slot. For instance, + * for values 1800, 1123, 1453, ... fall under the same slot (10) and + * the exponential moving average is computed every time a new value + * is stored at this slot. + * + * 3. Exponential Moving Average + * + * The EMA is largely used to track a signal for stocks or as a low + * pass filter. The magic of the formula, is it is very simple and the + * reactivity of the average can be tuned with the factors called + * alpha. + * + * The higher the alphas are, the faster the average respond to the + * signal change. In our case, if a slot in the array is a big + * interval, we can have numbers with a big difference between + * them. The impact of those differences in the average computation + * can be tuned by changing the alpha value. + * + * + * -- The algorithm -- + * + * We saw the different processing above, now let's see how they are + * used together. + * + * For each interrupt: + * For each interval: + * Compute the index = ilog2(interval) + * Compute a new_ema(buffer[index], interval) + * Store the index in a circular buffer + * + * Compute the suffix array of the indexes + * + * For each suffix: + * If the suffix is reverse-found 3 times + * Return suffix + * + * Return Not found + * + * However we can not have endless suffix array to be build, it won't + * make sense and it will add an extra overhead, so we can restrict + * this to a maximum suffix length of 5 and a minimum suffix length of + * 2. The experience showed 5 is the majority of the maximum pattern + * period found for different devices. + * + * The result is a pattern finding less than 1us for an interrupt. + * + * Example based on real values: + * + * Example 1 : MMC write/read interrupt interval: + * + * 223947, 1240, 1384, 1386, 1386, + * 217416, 1236, 1384, 1386, 1387, + * 214719, 1241, 1386, 1387, 1384, + * 213696, 1234, 1384, 1386, 1388, + * 219904, 1240, 1385, 1389, 1385, + * 212240, 1240, 1386, 1386, 1386, + * 214415, 1236, 1384, 1386, 1387, + * 214276, 1234, 1384, 1388, ? + * + * For each element, apply ilog2(value) + * + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? + * + * Max period of 5, we take the last (max_period * 3) 15 elements as + * we can be confident if the pattern repeats itself three times it is + * a repeating pattern. + * + * 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? + * + * Suffixes are: + * + * 1) 8, 15, 8, 8, 8 <- max period + * 2) 8, 15, 8, 8 + * 3) 8, 15, 8 + * 4) 8, 15 <- min period + * + * From there we search the repeating pattern for each suffix. 
+ * + * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 + * | | | | | | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | + * 8, 15, 8, 8, 8 + * + * When moving the suffix, we found exactly 3 matches. + * + * The first suffix with period 5 is repeating. + * + * The next event is (3 * max_period) % suffix_period + * + * In this example, the result 0, so the next event is suffix[0] => 8 + * + * However, 8 is the index in the array of exponential moving average + * which was calculated on the fly when storing the values, so the + * interval is ema[8] = 1366 + * + * + * Example 2: + * + * 4, 3, 5, 100, + * 3, 3, 5, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 5, 3, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 4, 5, 112, + * 4, 3, 4, 110 + * + * ilog2 + * + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Max period 5: + * 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Suffixes: + * + * 1) 0, 0, 4, 0, 0 + * 2) 0, 0, 4, 0 + * 3) 0, 0, 4 + * 4) 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | X + * 0, 0, 4, 0, 0, | X + * 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | + * 0, 0, 4, 0, | | | + * 0 0 4 + * + * Pattern is found 3 times, the remaining is 1 which results from + * (max_period * 3) % suffix_period. This value is the index in the + * suffix arrays. The suffix array for a period 4 has the value 4 + * at index 1. + */ +#define EMA_ALPHA_VAL 64 +#define EMA_ALPHA_SHIFT 7 + +#define PREDICTION_PERIOD_MIN 2 +#define PREDICTION_PERIOD_MAX 5 +#define PREDICTION_FACTOR 4 +#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ +#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ + +struct irqt_stat { + u64 last_ts; + u64 ema_time[PREDICTION_BUFFER_SIZE]; + int timings[IRQ_TIMINGS_SIZE]; + int circ_timings[IRQ_TIMINGS_SIZE]; + int count; +}; + +/* + * Exponential moving average computation + */ +static u64 irq_timings_ema_new(u64 value, u64 ema_old) +{ + s64 diff; + + if (unlikely(!ema_old)) + return value; + + diff = (value - ema_old) * EMA_ALPHA_VAL; + /* + * We can use a s64 type variable to be added with the u64 + * ema_old variable as this one will never have its topmost + * bit set, it will be always smaller than 2^63 nanosec + * interrupt interval (292 years). + */ + return ema_old + (diff >> EMA_ALPHA_SHIFT); +} + +static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) +{ + int i; + + /* + * The buffer contains the suite of intervals, in a ilog2 + * basis, we are looking for a repetition. We point the + * beginning of the search three times the length of the + * period beginning at the end of the buffer. We do that for + * each suffix. + */ + for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + + int *begin = &buffer[len - (i * 3)]; + int *ptr = begin; + + /* + * We look if the suite with period 'i' repeat + * itself. If it is truncated at the end, as it + * repeats we can use the period to find out the next + * element. 
+ */ + while (!memcmp(ptr, begin, i * sizeof(*ptr))) { + ptr += i; + if (ptr >= &buffer[len]) + return begin[((i * 3) % i)]; + } + } + + return -1; +} + +static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) +{ + int index, i, period_max, count, start, min = INT_MAX; + + if ((now - irqs->last_ts) >= NSEC_PER_SEC) { + irqs->count = irqs->last_ts = 0; + return U64_MAX; + } + + /* + * As we want to find three times the repetition, we need a + * number of intervals greater or equal to three times the + * maximum period, otherwise we truncate the max period. + */ + period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : irqs->count / 3; + + /* + * If we don't have enough irq timings for this prediction, + * just bail out. + */ + if (period_max <= PREDICTION_PERIOD_MIN) + return U64_MAX; + + /* + * 'count' will depends if the circular buffer wrapped or not + */ + count = irqs->count < IRQ_TIMINGS_SIZE ? + irqs->count : IRQ_TIMINGS_SIZE; + + start = irqs->count < IRQ_TIMINGS_SIZE ? + 0 : (irqs->count & IRQ_TIMINGS_MASK); + + /* + * Copy the content of the circular buffer into another buffer + * in order to linearize the buffer instead of dealing with + * wrapping indexes and shifted array which will be prone to + * error and extremelly difficult to debug. + */ + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + + irqs->timings[i] = irqs->circ_timings[index]; + min = min_t(int, irqs->timings[i], min); + } + + index = irq_timings_next_event_index(irqs->timings, count, period_max); + if (index < 0) + return irqs->last_ts + irqs->ema_time[min]; + + return irqs->last_ts + irqs->ema_time[index]; +} + +static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) +{ + u64 old_ts = irqs->last_ts; + u64 interval; + int index; + + /* + * The timestamps are absolute time values, we need to compute + * the timing interval between two interrupts. + */ + irqs->last_ts = ts; + + /* + * The interval type is u64 in order to deal with the same + * type in our computation, that prevent mindfuck issues with + * overflow, sign and division. + */ + interval = ts - old_ts; + + /* + * The interrupt triggered more than one second apart, that + * ends the sequence as predictible for our purpose. In this + * case, assume we have the beginning of a sequence and the + * timestamp is the first value. As it is impossible to + * predict anything at this point, return. + * + * Note the first timestamp of the sequence will always fall + * in this test because the old_ts is zero. That is what we + * want as we need another timestamp to compute an interval. + */ + if (interval >= NSEC_PER_SEC) { + irqs->count = 0; + return; + } + + /* + * Get the index in the ema table for this interrupt. The + * PREDICTION_FACTOR increase the interval size for the array + * of exponential average. + */ + index = likely(interval) ? + ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; + + /* + * Store the index as an element of the pattern in another + * circular array. 
+ */ + irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; + + irqs->ema_time[index] = irq_timings_ema_new(interval, + irqs->ema_time[index]); + + irqs->count++; +} + /** * irq_timings_next_event - Return when the next event is supposed to arrive * @@ -61,6 +463,12 @@ void irq_timings_disable(void) */ u64 irq_timings_next_event(u64 now) { + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + struct irqt_stat *irqs; + struct irqt_stat __percpu *s; + u64 ts, next_evt = U64_MAX; + int i, irq = 0; + /* * This function must be called with the local irq disabled in * order to prevent the timings circular buffer from being updated @@ -68,7 +476,51 @@ u64 irq_timings_next_event(u64 now) */ lockdep_assert_irqs_disabled(); - return 0; + if (!irqts->count) + return next_evt; + + /* + * Number of elements in the circular buffer: if it was flushed + * before, then the number of elements could be smaller than + * IRQ_TIMINGS_SIZE, so the count is used; otherwise the array + * size is used, as we wrapped. The index begins from zero when + * we did not wrap. That could be done in a nicer way with a + * proper circular array structure type, but at the cost of + * extra computation in the interrupt handler hot path. We + * choose efficiency. + * + * Inject the measured irq/timestamp pairs into the pattern + * prediction model while decrementing the counter, because we + * consume the data from our circular buffer. + */ + + i = (irqts->count & IRQ_TIMINGS_MASK) - 1; + irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); + + for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + irq = irq_timing_decode(irqts->values[i], &ts); + s = idr_find(&irqt_stats, irq); + if (s) + irq_timings_store(irq, this_cpu_ptr(s), ts); + } + + /* + * Look through each interrupt's statistics for the earliest + * next event. + */ + idr_for_each_entry(&irqt_stats, s, i) { + + irqs = this_cpu_ptr(s); + + ts = __irq_timings_next_event(irqs, i, now); + if (ts <= now) + return now; + + if (ts < next_evt) + next_evt = ts; + } + + return next_evt; } void irq_timings_free(int irq) -- cgit v1.2.3
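As a standalone sketch of the suffix detection above (illustrative userspace C, not part of the patch; only the comparison loop mirrors the kernel logic):

#include <stdio.h>
#include <string.h>

/* Check whether the last (3 * period) slots repeat with that period and,
 * if so, return the value predicted for the next slot. */
static int next_event_index(const int *buffer, size_t len, int period_max,
			    int period_min)
{
	for (int i = period_max; i >= period_min; i--) {
		const int *begin = &buffer[len - (i * 3)];
		const int *ptr = begin;

		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buffer[len])
				return begin[(i * 3) % i]; /* (i * 3) % i == 0 */
		}
	}
	return -1;
}

int main(void)
{
	/* Example 1 above: ilog2 values with a repeating period of 5 */
	int buf[] = { 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 };

	printf("next slot: %d\n",
	       next_event_index(buf, sizeof(buf) / sizeof(buf[0]), 5, 2));
	return 0;
}

This prints "next slot: 8": the period-5 suffix matches three times, so the prediction is begin[0], which is then mapped back through the exponential moving average table to get the actual interval.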
From 9eca544b1491df90ea7102a7ed14acc3c562d97b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Mar 2019 11:33:21 +0100 Subject: cpufreq: schedutil: Simplify iowait boosting There is no reason for the minimum iowait boost value in the schedutil cpufreq governor to depend on the available range of CPU frequencies. In fact, that dependency is generally confusing, because it causes the iowait boost to behave somewhat differently on CPUs with the same maximum frequency and different minimum frequencies, for example. For this reason, replace the min field in struct sugov_cpu with a constant and choose its value to be 1/8 of SCHED_CAPACITY_SCALE (for consistency with the intel_pstate driver's internal governor). [Note that policy->cpuinfo.max_freq will not be constant any more after a subsequent change, which depends on this one.] Link: https://lore.kernel.org/lkml/20190305083202.GU32494@hirez.programming.kicks-ass.net/T/#ee20bdc98b7d89f6110c0d00e5c3ee8c2ced93c3d Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c41ea367422..b3a878aa593d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@ #include #include +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -51,7 +53,6 @@ struct sugov_cpu { u64 last_update; unsigned long bw_dl; - unsigned long min; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more than a tick, then - we enable the boost starting from the minimum frequency, which improves - energy efficiency by ignoring sporadic wakeups from IO. + we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + efficiency by ignoring sporadic wakeups from IO. */ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,7 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -317,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - successive" wakeup from IO, ranging from the utilization of the minimum - OPP to the utilization of the maximum OPP. + successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + of the maximum OPP. + * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. */ @@ -349,7 +351,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** @@ -389,7 +391,7 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; return util; } @@ -826,9 +828,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->min = - (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) / - policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { -- cgit v1.2.3
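A minimal sketch of the resulting boost behaviour (illustrative flow only, not the kernel implementation):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

/* Double the boost on each frequent IO wakeup, halve it otherwise. */
static unsigned long iowait_boost(unsigned long boost, int io_wakeup)
{
	if (io_wakeup) {
		if (!boost)
			return IOWAIT_BOOST_MIN;	/* first wakeup after IO */
		boost *= 2;
		return boost > SCHED_CAPACITY_SCALE ?
			SCHED_CAPACITY_SCALE : boost;	/* cap at max capacity */
	}
	boost >>= 1;					/* no boost pending: decay */
	return boost < IOWAIT_BOOST_MIN ? 0 : boost;
}

int main(void)
{
	unsigned long b = 0;
	int i;

	for (i = 0; i < 5; i++)		/* successive IO wakeups */
		printf("boost = %lu\n", b = iowait_boost(b, 1));
	for (i = 0; i < 4; i++)		/* then updates with no IO */
		printf("boost = %lu\n", b = iowait_boost(b, 0));
	return 0;
}

The boost now always walks from IOWAIT_BOOST_MIN (128) up to SCHED_CAPACITY_SCALE and decays back below the minimum to zero, independently of the CPU's frequency range.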
From d1be6a28b13ce6d1bc42bf9b6a9454c65839225b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 12:48:44 +0000 Subject: asm-generic/mmiowb: Add generic implementation of mmiowb() tracking In preparation for removing all explicit mmiowb() calls from driver code, implement a tracking system in asm-generic based loosely on the PowerPC implementation. This allows architectures with a non-empty mmiowb() definition to have the barrier automatically inserted in spin_unlock() following a critical section containing an I/O write. Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- kernel/Kconfig.locks | 7 +++++++ kernel/locking/spinlock.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..6ba2570eddad 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP + +config ARCH_HAS_MMIOWB + bool + +config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@ #include #include +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are -- cgit v1.2.3 From 60ca1e5a200cd294a12907fa36dece4241db4ab8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 12:59:59 +0000 Subject: mmiowb: Hook up mmiowb helpers to spinlocks and generic I/O accessors Removing explicit calls to mmiowb() from driver code means that we must now call into the generic mmiowb_spin_{lock,unlock}() functions from the core spinlock code. In order to elide barriers following critical sections without any I/O writes, we also hook into the asm-generic I/O routines. Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- kernel/locking/spinlock_debug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); debug_spin_lock_after(lock); } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&lock->raw_lock); - if (ret) + if (ret) { + mmiowb_spin_lock(); debug_spin_lock_after(lock); + } #ifndef CONFIG_SMP /* * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) void do_raw_spin_unlock(raw_spinlock_t *lock) { + mmiowb_spin_unlock(); debug_spin_unlock(lock); arch_spin_unlock(&lock->raw_lock); } -- cgit v1.2.3
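Roughly, the tracking these hooks drive keeps a per-CPU nesting count and a pending flag (a simplified sketch of the asm-generic helpers; the real header differs in detail):

/* Sketch only -- see include/asm-generic/mmiowb.h for the real thing. */
struct mmiowb_state {
	u16	nesting_count;	/* spinlocks currently held on this CPU */
	u16	mmiowb_pending;	/* an I/O write was done under a lock */
};

static inline void mmiowb_set_pending(void)	/* called by I/O accessors */
{
	struct mmiowb_state *ms = __mmiowb_state();

	if (likely(ms->nesting_count))
		ms->mmiowb_pending = ms->nesting_count;
}

static inline void mmiowb_spin_lock(void)
{
	struct mmiowb_state *ms = __mmiowb_state();

	if (ms->nesting_count++ == 0)
		ms->mmiowb_pending = 0;	/* fresh critical section */
}

static inline void mmiowb_spin_unlock(void)
{
	struct mmiowb_state *ms = __mmiowb_state();

	if (unlikely(ms->mmiowb_pending)) {
		ms->mmiowb_pending = 0;
		mmiowb();	/* order the I/O write before the unlock */
	}
	ms->nesting_count--;
}

The net effect is that a critical section with no I/O writes pays nothing, while one with any number of I/O writes pays for a single mmiowb() at unlock time.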
From 12f2639038ef420fe796171ffb810b30d1ac0619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2019 21:46:12 +0200 Subject: tracing: stop making gpio tracing configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpio tracing was made configurable in 4.4-rc1 (commit ddd70280bf0e ("tracing: gpio: Add Kconfig option for enabling/disabling trace events")). Since then it is the only event type that can be compiled conditionally. Given that there is only a little overhead, I don't understand the reasoning, and I was annoyed more than once that gpio events were not available without recompiling. So drop the Kconfig symbol and make gpio events available unconditionally. Signed-off-by: Uwe Kleine-König Signed-off-by: Linus Walleij --- kernel/trace/Kconfig | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8bd1d6d001d7..5d965cef6c77 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,13 +774,6 @@ config TRACE_EVAL_MAP_FILE If unsure, say N -config TRACING_EVENTS_GPIO - bool "Trace gpio events" - depends on GPIOLIB - default y - help - Enable tracing events for gpio subsystem - config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL -- cgit v1.2.3 From 1e144d73f7295f766568c357448a11eb12868e29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 16:07:48 -0400 Subject: tracing: Add trace_array parameter to create_event_filter() Pass in the trace_array that represents the instance the filter being changed is in to create_event_filter(). This allows error messages that arise when writing to the filter to be displayed in the proper instance "error_log" file. Note, for calls to create_filter() (which was also modified to support create_event_filter()) that change filters that do not exist in an instance (for perf, for example), NULL may be passed in, which means that there will not be any message to log for that filter. Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 ++- kernel/trace/trace_events_filter.c | 25 ++++++++++++++----------- kernel/trace/trace_events_trigger.c | 3 ++- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b711edbef7e7..809c5d7f0064 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1553,7 +1553,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct trace_event_call *call, +extern int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 290d42c59101..2b63930cd3e6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -920,7 +920,8 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static void append_filter_err(struct filter_parse_error *pe, +static void append_filter_err(struct trace_array *tr, + struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; @@ -1607,7 +1608,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (err) { filter_disable(file); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(pe, filter); + append_filter_err(tr, pe, filter); } else event_set_filtered_flag(file); @@ -1719,7 +1720,8 @@ static void create_filter_finish(struct filter_parse_error *pe) * information if @set_str is %true and the caller is responsible for * freeing it.
 */ -static int create_filter(struct trace_event_call *call, +static int create_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_string, bool set_str, struct event_filter **filterp) { @@ -1736,17 +1738,18 @@ static int create_filter(struct trace_event_call *call, err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); create_filter_finish(pe); return err; } -int create_event_filter(struct trace_event_call *call, +int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { - return create_filter(call, filter_str, set_str, filterp); + return create_filter(tr, call, filter_str, set_str, filterp); } /** @@ -1773,7 +1776,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); } } create_filter_finish(pe); @@ -1804,7 +1807,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) return 0; } - err = create_filter(call, filter_string, true, &filter); + err = create_filter(file->tr, call, filter_string, true, &filter); /* * Always swap the call filter with the new filter @@ -2060,7 +2063,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - err = create_filter(call, filter_str, false, &filter); + err = create_filter(NULL, call, filter_str, false, &filter); if (err) goto free_filter; @@ -2209,8 +2212,8 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = create_filter(&event_ftrace_test_filter, d->filter, - false, &filter); + err = create_filter(NULL, &event_ftrace_test_filter, + d->filter, false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str, goto out; /* The filter is for the 'trigger' event, not the triggered event */ - ret = create_event_filter(file->event_call, filter_str, false, &filter); + ret = create_event_filter(file->tr, file->event_call, + filter_str, false, &filter); /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. -- cgit v1.2.3 From d0cd871ba0d613e09366e4b6a17946dfcf51db7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 22:30:22 -0400 Subject: tracing: Have histogram code pass around trace_array for error handling Pass the trace_array associated with the trace instance of the histogram around to the relevant functions so that error handling can display the error message in the proper instance.
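The shape of this change, reduced to a toy example (hypothetical names, not the kernel API): error reporting gains an explicit instance handle, and callers without an instance pass NULL, meaning there is nowhere to log the error:

#include <stdio.h>
#include <stddef.h>

struct instance {
	const char *name;	/* stands in for struct trace_array */
};

static void log_err(const struct instance *inst, const char *msg)
{
	if (!inst)		/* no instance (e.g. the perf path): drop it */
		return;
	printf("[%s] error: %s\n", inst->name, msg);
}

static int create_filter(const struct instance *inst, const char *filter)
{
	if (!filter || !filter[0]) {
		log_err(inst, "empty filter string");
		return -1;
	}
	return 0;		/* real parsing would go here */
}

int main(void)
{
	struct instance top = { "top_level" };

	create_filter(&top, "");	/* logged against "top_level" */
	create_filter(NULL, "");	/* no instance: silently dropped */
	return 0;
}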
Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 142 ++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 071c62cacba7..a167e439e9a1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -619,7 +619,7 @@ static void last_cmd_set(struct trace_event_file *file, char *str) snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err(u8 err_type, u8 err_pos) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); } @@ -1756,7 +1756,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1807,7 +1807,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, + errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2042,7 +2043,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; } -static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +static int parse_assignment(struct trace_array *tr, + char *str, struct hist_trigger_attrs *attrs) { int ret = 0; @@ -2098,7 +2100,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err(HIST_ERR_TOO_MANY_VARS, errpos(str)); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2115,7 +2117,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) return ret; } -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) { struct hist_trigger_attrs *attrs; int ret = 0; @@ -2128,7 +2131,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) char *str = strsep(&trigger_str, ":"); if (strchr(str, '=')) { - ret = parse_assignment(str, attrs); + ret = parse_assignment(tr, str, attrs); if (ret) goto free; } else if (strcmp(str, "pause") == 0) @@ -2684,6 +2687,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, char *var_name) { struct hist_field *var_field = NULL, *ref_field = NULL; + struct trace_array *tr = hist_data->event_file->tr; if (!is_var_ref(var_name)) return NULL; @@ -2696,7 +2700,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err(HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2707,6 +2711,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; + struct trace_array *tr = file->tr; modifier = str = kstrdup(field_str, GFP_KERNEL); if (!modifier) @@ -2730,7 +2735,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file 
*file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err(HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2746,7 +2751,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err(HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2808,7 +2813,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); if (!s) { - hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + hist_field = parse_var_ref(hist_data, ref_system, + ref_event, ref_var); if (hist_field) { if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); @@ -2857,7 +2863,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2912,7 +2918,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } -static int check_expr_operands(struct hist_field *operand1, +static int check_expr_operands(struct trace_array *tr, + struct hist_field *operand1, struct hist_field *operand2) { unsigned long operand1_flags = operand1->flags; @@ -2940,7 +2947,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err(HIST_ERR_TIMESTAMP_MISMATCH, 0); + hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2958,7 +2965,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -3003,7 +3010,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(operand1, operand2); + ret = check_expr_operands(file->tr, operand1, operand2); if (ret) goto free; @@ -3196,14 +3203,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err(HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3216,7 +3223,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err(HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3277,7 +3284,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - 
hist_err(HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3289,7 +3296,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err(HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3422,25 +3429,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, { struct hist_field *val = NULL, *var = NULL; unsigned long flags = HIST_FIELD_FL_VAR; + struct trace_array *tr = file->tr; struct field_var *field_var; int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err(HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err(HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3767,19 +3775,20 @@ static int track_data_create(struct hist_trigger_data *hist_data, { struct hist_field *var_field, *ref_field, *track_var = NULL; struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; char *track_data_var_str; int ret = 0; track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err(HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); + hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err(HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); + hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3792,7 +3801,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3800,7 +3809,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3811,7 +3820,8 @@ static int track_data_create(struct hist_trigger_data *hist_data, return ret; } -static int parse_action_params(char *params, struct action_data *data) +static int parse_action_params(struct trace_array *tr, char *params, + struct action_data *data) { char *param, *saved_param; bool first_param = true; @@ -3819,20 +3829,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_PARAMS, 0); + hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - 
hist_err(HIST_ERR_PARAM_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err(HIST_ERR_INVALID_PARAM, errpos(param)); + hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3856,7 +3866,7 @@ static int parse_action_params(char *params, struct action_data *data) return ret; } -static int action_parse(char *str, struct action_data *data, +static int action_parse(struct trace_array *tr, char *str, struct action_data *data, enum handler_id handler) { char *action_name; @@ -3864,14 +3874,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3880,12 +3890,12 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err(HIST_ERR_NO_SAVE_PARAMS, 0); + hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; @@ -3894,7 +3904,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3906,7 +3916,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(params)); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3916,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3931,7 +3941,7 @@ static int action_parse(char *str, struct action_data *data, data->use_trace_keyword = true; if (params) { - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; } @@ -3984,7 +3994,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, goto free; } - ret = action_parse(str, data, handler); + ret = action_parse(hist_data->event_file->tr, str, data, handler); if (ret) goto free; out: @@ -4054,6 +4064,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, struct action_data *data, char *system, char *event, char *var) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field; var++; /* skip '$' */ @@ -4069,7 +4080,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err(HIST_ERR_PARAM_NOT_FOUND, errpos(var)); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4127,6 +4138,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, static int trace_action_create(struct hist_trigger_data *hist_data, struct action_data *data) { + struct trace_array *tr = hist_data->event_file->tr; char *event_name, *param, *system = NULL; struct 
hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; @@ -4144,7 +4156,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err(HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); + hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4205,14 +4217,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err(HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); + hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err(HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); + hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4231,6 +4243,7 @@ static int action_create(struct hist_trigger_data *hist_data, struct action_data *data) { struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; struct track_data *track_data; struct field_var *field_var; unsigned int i; @@ -4258,7 +4271,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err(HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); + hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4271,7 +4284,8 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err(HIST_ERR_FIELD_VAR_CREATE_FAIL, errpos(param)); + hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, + errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4305,18 +4319,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err(HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); + hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err(HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); + hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4332,7 +4346,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - ret = action_parse(str, data, HANDLER_ONMATCH); + ret = action_parse(tr, str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4401,13 +4415,14 @@ static int create_var_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *var_name, char *expr_str) { + struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err(HIST_ERR_DUPLICATE_VAR, errpos(var_name)); + hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4464,8 +4479,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field = NULL; - unsigned long flags = 0; unsigned int key_size; int ret = 0; @@ -4488,7 +4503,7 @@ static int 
create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err(HIST_ERR_INVALID_REF_KEY, errpos(field_str)); + hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4589,6 +4604,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data) static int parse_var_defs(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; char *s, *str, *var_name, *field_str; unsigned int i, j, n_vars = 0; int ret = 0; @@ -4602,13 +4618,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err(HIST_ERR_MALFORMED_ASSIGNMENT, errpos(var_name)); + hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, + errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err(HIST_ERR_TOO_MANY_VARS, errpos(var_name)); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5829,6 +5846,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, { struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + struct trace_array *tr = file->tr; int ret = 0; if (hist_data->attrs->name) { @@ -5836,7 +5854,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err(HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); + hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5857,7 +5875,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err(HIST_ERR_TRIGGER_EEXIST, 0); + hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5865,7 +5883,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err(HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); + hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5890,7 +5908,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err(HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); + hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } @@ -6108,7 +6126,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } - attrs = parse_hist_trigger_attrs(trigger); + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); -- cgit v1.2.3 From 2f754e771b1a6feba670782e82c45555984ac43b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 22:52:21 -0400 Subject: tracing: Have the error logs show up in the proper instances As each instance has its own error_log file, it makes more sense for each instance to show its own errors instead of all error_log files having the same data. Make the errors show up in the error_log file of the instance in which they happen. If no instance trace_array is available, then NULL can be passed in, which will create the error in the top level instance (the one at the top of the tracefs directory).
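The patch below caps the number of error_log entries per instance and then recycles the oldest one; a standalone sketch of that allocation strategy (illustrative names, locking omitted):

#include <stdio.h>
#include <stdlib.h>

#define ERRS_MAX 3	/* stands in for TRACING_LOG_ERRS_MAX */

struct err_entry {
	struct err_entry *next;
	char msg[64];
};

struct instance_log {
	struct err_entry *head, *tail;	/* oldest ... newest */
	unsigned int count;
};

/* Allocate until the cap is hit, then unlink and reuse the oldest entry. */
static struct err_entry *get_entry(struct instance_log *log)
{
	struct err_entry *e;

	if (log->count < ERRS_MAX) {
		e = calloc(1, sizeof(*e));
		if (e)
			log->count++;
		return e;
	}
	e = log->head;
	log->head = e->next;
	return e;
}

static void log_err(struct instance_log *log, const char *msg)
{
	struct err_entry *e = get_entry(log);

	if (!e)
		return;
	snprintf(e->msg, sizeof(e->msg), "%s", msg);
	e->next = NULL;
	if (log->tail)
		log->tail->next = e;
	else
		log->head = e;
	log->tail = e;
}

int main(void)
{
	struct instance_log log = { 0 };
	struct err_entry *e;

	log_err(&log, "error 1");
	log_err(&log, "error 2");
	log_err(&log, "error 3");
	log_err(&log, "error 4");	/* recycles "error 1" */

	for (e = log.head; e; e = e->next)
		printf("%s\n", e->msg);
	return 0;
}

The log stays bounded per instance while always holding the most recent errors.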
Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 55 +++++++++++++++++++++++++------------- kernel/trace/trace.h | 5 +++- kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_events_hist.c | 3 ++- kernel/trace/trace_probe.c | 2 +- 5 files changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7978168f5041..3d55e9daae8c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6897,25 +6897,22 @@ struct tracing_log_err { char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ }; -static LIST_HEAD(tracing_err_log); static DEFINE_MUTEX(tracing_err_log_lock); -static unsigned int n_tracing_err_log_entries; - -struct tracing_log_err *get_tracing_log_err(void) +struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; - if (n_tracing_err_log_entries < TRACING_LOG_ERRS_MAX) { + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) err = ERR_PTR(-ENOMEM); - n_tracing_err_log_entries++; + tr->n_err_log_entries++; return err; } - err = list_first_entry(&tracing_err_log, struct tracing_log_err, list); + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); list_del(&err->list); return err; @@ -6949,6 +6946,7 @@ unsigned int err_pos(char *cmd, const char *str) /** * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) * @loc: A string describing where the error occurred * @cmd: The tracing command that caused the error * @errs: The array of loc-specific static error strings @@ -6973,13 +6971,17 @@ unsigned int err_pos(char *cmd, const char *str) * existing callers for examples of how static strings are typically * defined for use with tracing_log_err(). 
*/ -void tracing_log_err(const char *loc, const char *cmd, +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, const char **errs, u8 type, u8 pos) { struct tracing_log_err *err; + if (!tr) + tr = &global_trace; + mutex_lock(&tracing_err_log_lock); - err = get_tracing_log_err(); + err = get_tracing_log_err(tr); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; @@ -6993,34 +6995,38 @@ void tracing_log_err(const char *loc, const char *cmd, err->info.pos = pos; err->info.ts = local_clock(); - list_add_tail(&err->list, &tracing_err_log); + list_add_tail(&err->list, &tr->err_log); mutex_unlock(&tracing_err_log_lock); } -static void clear_tracing_err_log(void) +static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; mutex_lock(&tracing_err_log_lock); - list_for_each_entry_safe(err, next, &tracing_err_log, list) { + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); kfree(err); } - n_tracing_err_log_entries = 0; + tr->n_err_log_entries = 0; mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; + mutex_lock(&tracing_err_log_lock); - return seq_list_start(&tracing_err_log, *pos); + return seq_list_start(&tr->err_log, *pos); } static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &tracing_err_log, pos); + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); } static void tracing_err_log_seq_stop(struct seq_file *m, void *v) @@ -7067,15 +7073,25 @@ static const struct seq_operations tracing_err_log_seq_ops = { static int tracing_err_log_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - clear_tracing_err_log(); + clear_tracing_err_log(tr); - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &tracing_err_log_seq_ops); - + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + trace_array_put(tr); + } + } return ret; } @@ -7091,6 +7107,7 @@ static const struct file_operations tracing_err_log_fops = { .write = tracing_err_log_write, .read = seq_read, .llseek = seq_lseek, + .release = tracing_release_generic_tr, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) @@ -8293,6 +8310,7 @@ struct trace_array *trace_array_create(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9087,6 +9105,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 809c5d7f0064..da00a3d508c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -293,11 +293,13 @@ struct trace_array { int nr_topts; bool clear_trace; int buffer_percent; + unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char 
trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; @@ -1886,7 +1888,8 @@ extern ssize_t trace_parse_run_command(struct file *file, int (*createfn)(int, char**)); extern unsigned int err_pos(char *cmd, const char *str); -extern void tracing_log_err(const char *loc, const char *cmd, +extern void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, const char **errs, u8 type, u8 pos); /* diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2b63930cd3e6..180ecb390baa 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -949,12 +949,12 @@ static void append_filter_err(struct trace_array *tr, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); - tracing_log_err("event filter parse error", + tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); - tracing_log_err("event filter parse error", + tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, FILT_ERR_ERRNO, 0); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a167e439e9a1..a1136e043f17 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -621,7 +621,8 @@ static void last_cmd_set(struct trace_event_file *file, char *str) static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } static void hist_err_clear(void) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e11f98c49d72..4cc2d467d34c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -186,7 +186,7 @@ void __trace_probe_log_err(int offset, int err_type) } *(p - 1) = '\0'; - tracing_log_err(trace_probe_log.subsystem, command, + tracing_log_err(NULL, trace_probe_log.subsystem, command, trace_probe_err_text, err_type, pos + offset); kfree(command); -- cgit v1.2.3 From a8d655792a32312f6715ac789b860fee50168106 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:25 -0500 Subject: tracing: Add error_log to README Add brief blurb about error_log to the 'Important files' section. 
Link: http://lkml.kernel.org/r/c81e60f9aded495081231a32d2d1023c4d043a7a.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3d55e9daae8c..2bc18de7f0dc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4702,6 +4702,7 @@ static const char readme_msg[] = " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t-change the clock used to order events\n" -- cgit v1.2.3 From 4f5fbd78a7b40bab538ae0d316363530da751e42 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 26 Mar 2019 20:13:11 +0800 Subject: rcu: validate arguments for rcu tracepoints When CONFIG_RCU_TRACE is not set, all these tracepoints are defined as do-nothing macros. We'd better make them inline functions that take proper arguments. As RCU_TRACE() is likewise defined as a do-nothing macro when CONFIG_RCU_TRACE is not set, we can clean it up as well. Link: http://lkml.kernel.org/r/1553602391-11926-4-git-send-email-laoar.shao@gmail.com Reviewed-by: Paul E. McKenney Signed-off-by: Yafang Shao Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/rcu.h | 9 ++------- kernel/rcu/tree.c | 8 ++++---- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..442ace406ac9 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -11,11 +11,6 @@ #define __LINUX_RCU_H #include -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) + trace_rcu_invoke_kfree_callback(rn, head, offset); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head);) + trace_rcu_invoke_callback(rn, head); f = head->func; WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..906563a1cdea 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2352,14 +2352,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp) */ int rcutree_dying_cpu(unsigned int cpu) { - RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) - RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) + bool blkd; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; - RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; -- cgit v1.2.3
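The difference the changelog above is after, in reduced hypothetical form (trace_foo() is not a real kernel API): with the config off, an empty static inline stub still type-checks its arguments, while a do-nothing macro accepts anything until the config is turned on:

/* Hypothetical example, not taken from the kernel tree. */
#ifdef CONFIG_FOO_TRACE
void trace_foo(const char *name, unsigned long seq);	/* real tracepoint */
#else
static inline void trace_foo(const char *name, unsigned long seq) { }
#endif

/* With "#define trace_foo(name, seq)" instead, a call such as
 * trace_foo(42, "backwards") would compile cleanly whenever
 * CONFIG_FOO_TRACE is disabled and only break once it is enabled. */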
TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; -- cgit v1.2.3 From c13edf8106f6ad1edb9b7e011351fbaf83ceb992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Leger?= Date: Wed, 27 Mar 2019 14:06:27 +0100 Subject: dma: select GENERIC_ALLOCATOR for DMA_REMAP When DMA_REMAP is enabled, code in remap.c needs generic allocator. It currently worked since few architectures uses it (arm64, csky) and they both select GENERIC_ALLOCATOR. Select it when using DMA_REMAP to have correct dependencies. Signed-off-by: Clement Leger Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a06ba3013b3b..52b704e2b97a 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -57,6 +57,7 @@ config SWIOTLB config DMA_REMAP depends on MMU + select GENERIC_ALLOCATOR bool config DMA_DIRECT_REMAP -- cgit v1.2.3 From d7e02a931235de0779d44c6f8d211df0eca304b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:45:21 +0100 Subject: dma-mapping: remove leftover NULL device support Most dma_map_ops implementations already had some issues with a NULL device, or did simply crash if one was fed to them. Now that we have cleaned up all the obvious offenders we can stop to pretend we support this mode. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index fcdb23e8d2fc..2c2772e9702a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -311,7 +311,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - (!dev || dma_capable(dev, dma_addr, size)); + dma_capable(dev, dma_addr, size); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, -- cgit v1.2.3 From e43e2657fe77a37b13643e2469670ecdb0ba5e10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Dec 2018 14:32:02 +0100 Subject: x86/dma: Remove the x86_dma_fallback_dev hack Now that we removed support for the NULL device argument in the DMA API, there is no need to cater for that in the x86 code. Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..685a53f2a793 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,10 +238,6 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) -- cgit v1.2.3 From 24acfb71822566e4d469b4992a7b3b9f873e0083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:47 +0100 Subject: workqueue: Use normal rcu There is no need for sched_rcu. 
From 24acfb71822566e4d469b4992a7b3b9f873e0083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:47 +0100 Subject: workqueue: Use normal rcu There is no need for sched_rcu. The undocumented reason why sched_rcu is used is to avoid a few explicit rcu_read_lock()/unlock() pairs, exploiting the fact that sched-RCU reader side critical sections are also protected by preempt or irq disabled regions. Replace rcu_read_lock_sched() with rcu_read_lock() and acquire the RCU lock where it is not yet explicitly acquired. Replace local_irq_disable() with rcu_read_lock(). Update asserts. Signed-off-by: Thomas Gleixner [bigeasy: mangle changelog a little] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 93 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 21721faa923c..37a32884986b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -127,16 +127,16 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or - * sched-RCU for reads. + * RCU for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -183,7 +183,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -212,7 +212,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; @@ -266,8 +266,8 @@ struct workqueue_struct { char name[WQ_NAME_LEN]; /* I: workqueue name */ /* - * Destruction of workqueue_struct is sched-RCU protected to allow - * walking the workqueues list without grabbing wq_pool_mutex. + * Destruction of workqueue_struct is RCU protected to allow walking + * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq.
*/ struct rcu_head rcu; @@ -359,20 +359,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -384,7 +384,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -416,7 +416,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -552,7 +552,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. @@ -696,8 +696,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -1133,7 +1133,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1261,6 +1261,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 
@@ -1299,10 +1300,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1416,6 +1419,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1472,10 +1476,8 @@ retry: /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1493,7 +1495,9 @@ retry: insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -2975,14 +2979,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3014,10 +3018,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -3504,7 +3509,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). 
@@ -3558,7 +3563,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ + /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } @@ -4472,7 +4477,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4483,7 +4489,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4509,15 +4516,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); + rcu_read_unlock(); return ret; } @@ -4701,7 +4708,7 @@ void show_workqueue_state(void) unsigned long flags; int pi; - rcu_read_lock_sched(); + rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); @@ -4766,7 +4773,7 @@ void show_workqueue_state(void) touch_nmi_watchdog(); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -5153,16 +5160,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. 
*/ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); @@ -5357,7 +5364,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -5365,7 +5373,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } -- cgit v1.2.3 From 3b15d09f7e6db44065aaba5fd16dc7420035c5ad Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 28 Feb 2019 13:13:26 +0800 Subject: time: Introduce jiffies64_to_msecs() There is a similar helper in net/netfilter/nf_tables_api.c; this may become a common requirement someday, so move it to time.c. Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Acked-by: John Stultz Signed-off-by: Pablo Neira Ayuso --- kernel/time/time.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..9e3f79d4f5a8 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -783,6 +783,16 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#else + return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * -- cgit v1.2.3 From 699c1868a743f530081f429058616a2dd5d8a4b2 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 8 Apr 2019 12:50:57 -0400 Subject: audit: purge unnecessary list_empty calls The original conditions that led to the use of list_empty() to optimize list_for_each_entry_rcu() in the auditfilter.c and auditsc.c code have been removed without removing the list_empty() call, and this code pattern has since been copied several times. Remove the unnecessary list_empty() calls.
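The redundant pattern being removed looks roughly like this (an illustrative sketch, not code quoted from the tree; do_something() is a made-up stand-in):

    rcu_read_lock();
    if (!list_empty(list))                  /* unnecessary guard */
        list_for_each_entry_rcu(e, list, list)
            do_something(e);                /* hypothetical per-entry work */
    rcu_read_unlock();

list_for_each_entry_rcu() simply iterates zero times over an empty list, so the guard buys nothing.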
Please see upstream github issue https://github.com/linux-audit/audit-kernel/issues/112 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 2 -- kernel/auditsc.c | 64 ++++++++++++++++++++++------------------------------ 2 files changed, 27 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..2c3c2f349b23 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1315,8 +1315,6 @@ int audit_filter(int msgtype, unsigned int listtype) int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[listtype])) - goto unlock_and_return; list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 98a98e6dca05..51a2ceb3a1ca 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -771,15 +771,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_DISABLED; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return state; - } + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, ctx->major) && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return state; } } rcu_read_unlock(); @@ -798,9 +796,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (list_empty(list)) - return 0; - list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { @@ -808,7 +803,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, return 1; } } - return 0; } @@ -1945,18 +1939,16 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(inode->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -2065,18 +2057,16 @@ void __audit_inode_child(struct inode *parent, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(parent->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } -- cgit v1.2.3 From d75f773c86a2b8b7278e2c33343b46a4024bc002 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 25 Mar 2019 
21:32:28 +0200 Subject: treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively %pF and %pf are functionally equivalent to %pS and %ps conversion specifiers. The former are deprecated, therefore switch the current users to use the preferred variant. The changes have been produced by the following command: git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done And verifying the result. Link: http://lkml.kernel.org/r/20190325193229.23390-1-sakari.ailus@linux.intel.com Cc: Andy Shevchenko Cc: linux-arm-kernel@lists.infradead.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: xen-devel@lists.xenproject.org Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvdimm@lists.01.org Cc: linux-pci@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-btrfs@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-mm@kvack.org Cc: ceph-devel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Sakari Ailus Acked-by: David Sterba (for btrfs) Acked-by: Mike Rapoport (for mm/memblock.c) Acked-by: Bjorn Helgaas (for drivers/pci) Acked-by: Rafael J. Wysocki Signed-off-by: Petr Mladek --- kernel/async.c | 4 ++-- kernel/events/uprobes.c | 2 +- kernel/fail_function.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 2 +- kernel/irq/spurious.c | 4 ++-- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 12 ++++++------ 12 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index affa830a198c..48abc0f18eae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- 
a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..ec43ab2fdfda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -778,7 +778,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..8eee921b384d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2870,7 +2870,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). 
*/ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..1002cf61700a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..6502c3ed317e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1328,7 +1328,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7abbeed13421..8ea9e4fb8cc6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2277,7 +2277,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2596,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -4582,7 +4582,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4607,7 +4607,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4639,7 +4639,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? 
"," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); -- cgit v1.2.3 From 6f9b83ac877fb5558d76b9f78590f3afd1bdf421 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 27 Mar 2019 15:35:47 +0100 Subject: cpuidle: Export the next timer expiration for CPUs To be able to predict the sleep duration for a CPU entering idle, it is essential to know the expiration time of the next timer. Both the teo and the menu cpuidle governors already use this information for CPU idle state selection. Moving forward, a similar prediction needs to be made for a group of idle CPUs rather than for a single one and the following changes implement a new genpd governor for that purpose. In order to support that feature, add a new function called tick_nohz_get_next_hrtimer() that will return the next hrtimer expiration time of a given CPU to be invoked after deciding whether or not to stop the scheduler tick on that CPU. Make the cpuidle core call tick_nohz_get_next_hrtimer() right before invoking the ->enter() callback provided by the cpuidle driver for the given state and store its return value in the per-CPU struct cpuidle_device, so as to make it available to code outside of cpuidle. Note that at the point when cpuidle calls tick_nohz_get_next_hrtimer(), the governor's ->select() callback has already returned and indicated whether or not the tick should be stopped, so in fact the value returned by tick_nohz_get_next_hrtimer() always is the next hrtimer expiration time for the given CPU, possibly including the tick (if it hasn't been stopped). Co-developed-by: Lina Iyer Co-developed-by: Daniel Lezcano Acked-by: Daniel Lezcano Signed-off-by: Ulf Hansson [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/time/tick-sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..8d18e03124ff 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1022,6 +1022,18 @@ bool tick_nohz_idle_got_tick(void) return false; } +/** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whatever that expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ + return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped -- cgit v1.2.3 From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for ldimm64 instruction that indicates that inside the first part of the double insns's imm field is a file descriptor which the verifier then replaces as a full 64bit address of the map into both imm parts. 
For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insns's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is the array map with a single entry. It is possible to support more than just a single map element by reusing both 16bit off fields of the insns as a map index, so a full array map lookup could be expressed that way. It hasn't been implemented here due to lack of a concrete use case, but could easily be done in the future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not distinguish an offset-0 load of the map pointer from a load of the map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding to an off-by-one scheme to distinguish a regular map pointer from a map value pointer would add unnecessary complexity and raise the barrier for debuggability, making it less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations, since the maximum possible value size is in the u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call, which needs to prepare registers according to the calling convention etc., without needing the extra NULL test, and without having to add the offset to the value base pointer in an additional instruction. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user-passed offset in the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are treated as typical map value handling, without anything extra needed from the verification side. The two map operations for direct value access have been added to the array map for now. In the future other types could be supported as well, depending on the use case. The main use case for this commit is to allow BPF loader support for global variables that reside in .data/.rodata/.bss sections, such that we can directly load their address with minimal additional infrastructure required. Loader support has been added in subsequent commits for the libbpf library.
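As an illustrative sketch of the described encoding (loader-side, not part of this patch; the register choice and offset are made up):

    /* ldimm64 pseudo insn pair that loads the address of the value
     * bytes at offset 16 of a (single entry) array map into r1. The
     * verifier rewrites the two imm halves into the final 64 bit
     * address as described above.
     */
    struct bpf_insn insn[2] = {
        { .code    = BPF_LD | BPF_DW | BPF_IMM,
          .dst_reg = BPF_REG_1,
          .src_reg = BPF_PSEUDO_MAP_VALUE,
          .imm     = map_fd },        /* first imm half: map fd */
        { .imm     = 16 },            /* second imm half: value offset */
    };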
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 32 +++++++++++++++++++ kernel/bpf/core.c | 3 +- kernel/bpf/disasm.c | 5 +-- kernel/bpf/syscall.c | 28 ++++++++++++----- kernel/bpf/verifier.c | 86 +++++++++++++++++++++++++++++++++++++++------------ 5 files changed, 124 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..1a6e9861d554 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2966cb368bf4..ace8c22c8b0e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index de73f55e42fd..d9ce383c0f9c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. 
*/ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d65e56594db..828518bb947b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + return NULL; } @@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48718e1da16d..6ab7a23fc924 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -6803,8 +6808,10 @@ static int 
replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || - insn[1].imm != 0) { - verbose(env, "unrecognized bpf_ld_imm64 insn\n"); + /* In final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. + */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && -- cgit v1.2.3 From be70bcd53de66e86f2726e576307cbdaebd3b1a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:04 +0200 Subject: bpf: do not retain flags that are not tied to map lifetime Both BPF_F_WRONLY / BPF_F_RDONLY flags are tied to the map file descriptor, but not to the map object itself! Meaning, at map creation time BPF_F_RDONLY can be set to make the map read-only from syscall side, but this holds only for the returned fd, so any other fd either retrieved via bpf file system or via map id for the very same underlying map object can have read-write access instead. Given that, keeping the two flags around in the map_flags attribute and exposing them to user space upon map dump is misleading and may lead to false conclusions. Since these two flags are not tied to the map object lets also not store them as map property. 
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 828518bb947b..56b4b0e08b3b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,13 +166,25 @@ void bpf_map_area_free(void *area) kvfree(area); } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ + /* Some map creation flags are not tied to the map object but + * rather to the map fd instead, so they have no meaning upon + * map object inspection since multiple file descriptors with + * different (access) properties can exist here. Thus, given + * this has zero meaning for the map itself, lets clear these + * from here. + */ + return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} + void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; + map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); } -- cgit v1.2.3 From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags, BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG, in order to allow for read-only or write-only BPF maps from the BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but these only apply to the system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with the map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allow for the exact opposite, such that the verifier rejects program loads if a write into a read-only map or a read from a write-only map is detected. For the read-only map case, helpers that would alter the map state, such as map deletion or update, are also forbidden for such programs. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could follow in the future depending on use case. The main use case here is to forbid writes into .rodata map values from the verifier side.
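A minimal loader-side sketch of creating such a map (illustrative only, assuming the usual linux/bpf.h and syscall headers; error handling omitted):

    union bpf_attr attr = {
        .map_type    = BPF_MAP_TYPE_ARRAY,
        .key_size    = 4,
        .value_size  = 64,
        .max_entries = 1,
        .map_flags   = BPF_F_RDONLY_PROG,   /* programs may only read */
    };
    /* syscall-side fds keep their normal read/write access */
    int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));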
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 6 +++++- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 6 +++--- kernel/bpf/lpm_trie.c | 3 ++- kernel/bpf/queue_stack_maps.c | 6 +++--- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++++++++++-- 7 files changed, 62 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1a6e9861d554..217b10bd9f48 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -472,6 +473,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fed15cf94dca..192d32e77db3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. 
*/ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 93a5cbbde421..e61630c2e50b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -538,7 +538,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56b4b0e08b3b..0c9276b54c88 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -501,6 +501,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->spin_lock_off = btf_find_spin_lock(btf, value_type); if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ab7a23fc924..b747434df89c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1439,6 +1439,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, 
int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -2024,7 +2046,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -2327,6 +2351,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -3059,6 +3087,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -3069,11 +3098,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. + */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); -- cgit v1.2.3 From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows to "freeze" the map globally as read-only / immutable from syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in case of locked map. Main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning BPF loader first allocates map, sets up map value by copying .rodata section into it and once complete, it calls BPF_MAP_FREEZE on the map fd to prevent further modifications. 
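Sketched from the loader's perspective, that flow could look roughly like this (illustrative only; handle_error() is a made-up stand-in):

    union bpf_attr attr = { .map_fd = map_fd };

    /* ... BPF_MAP_UPDATE_ELEM calls copying the .rodata contents ... */

    if (syscall(__NR_bpf, BPF_MAP_FREEZE, &attr, sizeof(attr)) < 0)
        /* e.g. EBUSY if already frozen, EPERM without CAP_SYS_ADMIN */
        handle_error();
    /* from here on, syscall-side writes to the map fail with EPERM */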
Right now, BPF_MAP_FREEZE only takes the map fd as argument while the remaining bpf_attr members are required to be zero. I didn't add write-only locking here as a counterpart since I don't have a concrete use-case for it on my side, and I think it probably makes more sense to wait until there actually is one. In that case bpf_attr can be extended as usual with a flag field and/or others, where flag 0 means that we lock the map read-only; hence this doesn't prevent adding further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for a couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exactly one write before the map is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for the map to eventually be made immutable. While this can be made race-free, this approach feels less clean and, in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and the caller has the guarantee that the map is immutable from the syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close to BPF map spin locks (which already have the BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c9276b54c88..b3ce516e5a20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -355,6 +355,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side.
+ */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -376,14 +388,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -727,8 +741,7 @@ static int map_lookup_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -857,8 +870,7 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -969,8 +981,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1021,8 +1032,7 @@ static int map_get_next_key(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1089,8 +1099,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1132,6 +1141,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2735,6 +2774,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; -- cgit v1.2.3 From 3e0ddc4f3ff1436970e96e76f3df3c3b5f5173b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:07 +0200 Subject: bpf: allow . char as part of the object name Trivial addition to allow '.' aside from '_' as "special" characters in the object name. Used to allow for substrings in maps from loader side such as ".bss", ".data", ".rodata", but could also be useful for other purposes. 
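For instance, a loader preparing a BPF_MAP_CREATE attribute can now carry an ELF section name straight into the map name (illustrative sketch; attr is assumed to be a union bpf_attr):

    /* ".rodata" now passes bpf_obj_name_cpy(); "ro-data" still fails */
    memcpy(attr.map_name, ".rodata", sizeof(".rodata"));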
Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b3ce516e5a20..198c9680bf0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -474,10 +474,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src) const char *end = src + BPF_OBJ_NAME_LEN; memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ + /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { - if (!isalnum(*src) && *src != '_') + if (!isalnum(*src) && + *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } -- cgit v1.2.3 From 1dc92851849cc2235a1efef8f8d5a9255efc5f13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:09 +0200 Subject: bpf: kernel side support for BTF Var and DataSec This work adds kernel-side verification, logging and seq_show dumping of the BTF Var and DataSec kinds, which are emitted by the latest LLVM. The following constraints apply:

BTF Var must have:
- Its kind_flag is 0
- Its vlen is 0
- Must point to a valid type
- Type must not resolve to a forward type
- Size of underlying type must be > 0
- Must have a valid name
- Can only be a source type, not sink or intermediate one
- Name may include dots (e.g. in case of static variables inside functions)
- Cannot be a member of a struct/union
- Linkage can so far only be static or global/allocated

BTF DataSec must have:
- Its kind_flag is 0
- Its vlen cannot be 0
- Its size cannot be 0
- Must have a valid name
- Can only be a source type, not sink or intermediate one
- Name may include dots (e.g. to represent .bss, .data, .rodata etc)
- Cannot be a member of a struct/union
- Inner btf_var_secinfo array with {type,offset,size} triple must be sorted by offset in ascending order
- Type must always point to BTF Var
- BTF resolved size of Var must be <= size provided by triple
- DataSec size must be >= sum of triple sizes (thus holes are allowed)

btf_var_resolve(), btf_ptr_resolve() and btf_modifier_resolve() are on a high level quite similar, but each comes with slight, subtle differences. They could potentially be refactored a bit in the future, which hasn't been done here to ease review.
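To make the two kinds concrete, a sketch of what this looks like for a small object (type ids, sizes and the dump rendering are made up for illustration):

    /* foo.c */
    static int counter;             /* placed in .bss    */
    const int max_entries = 128;    /* placed in .rodata */

    /* resulting BTF, roughly:
     *   [4] VAR 'counter' type_id=1 (int), linkage=static
     *   [5] VAR 'max_entries' type_id=3 (const int), linkage=global-alloc
     *   [6] DATASEC '.bss' size=4 vlen=1
     *         type_id=4 offset=0 size=4
     *   [7] DATASEC '.rodata' size=4 vlen=1
     *         type_id=5 offset=0 size=4
     */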
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 397 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd3921b1514b..0cecf6bab61b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -185,6 +185,16 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, struct_type, member) \ + for (i = 0, member = btf_type_var_secinfo(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", }; struct btf_kind_operations { @@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ + return btf_type_is_var(t) || + btf_type_is_datasec(t); +} + /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same @@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t) static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || - btf_type_is_ptr(t) || - btf_type_is_struct(t) || - btf_type_is_array(t); + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t) || + btf_type_is_var(t) || + btf_type_is_datasec(t); } /* t->size can be used */ @@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_DATASEC: return true; } @@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t) return (const struct btf_enum *)(t + 1); } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ + return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. 
- */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ + if ((first ? !isalpha(c) : + !isalnum(c)) && + c != '_' && + ((c == '.' && !dot_ok) || + c != '.')) + return false; + return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ const char *src = &btf->strings[offset]; const char *src_limit; - if (!isalpha(*src) && *src != '_') + if (!__btf_name_char_ok(*src, true, dot_ok)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!isalnum(*src) && *src != '_') + if (!__btf_name_char_ok(*src, false, dot_ok)) return false; src++; } @@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, true); +} + static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, + const struct btf_type *datasec_type, + const struct btf_var_secinfo *vsi, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + if (env->phase != CHECK_META) + btf_verifier_log_type(env, datasec_type, NULL); + + __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", + vsi->type, vsi->offset, vsi->size); + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { @@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) return NULL; size = btf->resolved_sizes[size_type_id]; @@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return 0; } +static int btf_var_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type || btf_type_is_resolve_source_only(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + if 
(btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + /* We must resolve to something concrete at this point, no + * forward types or similar that would resolve to size of + * zero is allowed. + */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf, btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { + if (btf_type_is_resolve_source_only(index_type) || + btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { + if (btf_type_is_resolve_source_only(elem_type) || + btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_nosize_or_null(member_type)) { + if (btf_type_is_resolve_source_only(member_type) || + btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = { .seq_show = btf_df_seq_show, }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var *var; + u32 meta_needed = sizeof(*var); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if 
(!t->name_off || + !__btf_name_valid(env->btf, t->name_off, true)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + /* A var cannot be in type void */ + if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + var = btf_type_var(t); + if (var->linkage != BTF_VAR_STATIC && + var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + btf_verifier_log_type(env, t, "Linkage not supported"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ + const struct btf_var *var = btf_type_var(t); + + btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { + .check_meta = btf_var_check_meta, + .resolve = btf_var_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_var_log, + .seq_show = btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var_secinfo *vsi; + u64 last_vsi_end_off = 0, sum = 0; + u32 i, meta_needed; + + meta_needed = btf_type_vlen(t) * sizeof(*vsi); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (!btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen == 0"); + return -EINVAL; + } + + if (!t->size) { + btf_verifier_log_type(env, t, "size == 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !btf_name_valid_section(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_vsi(i, t, vsi) { + /* A var cannot be in type void */ + if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid type_id"); + return -EINVAL; + } + + if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset"); + return -EINVAL; + } + + if (!vsi->size || vsi->size > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid size"); + return -EINVAL; + } + + last_vsi_end_off = vsi->offset + vsi->size; + if (last_vsi_end_off > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset+size"); + return -EINVAL; + } + + btf_verifier_log_vsi(env, t, vsi, NULL); + sum += vsi->size; + } + + if (t->size < sum) { + btf_verifier_log_type(env, t, "Invalid btf_info size"); + return -EINVAL; + } + + return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_var_secinfo *vsi; + struct btf *btf = env->btf; + u16 i; + + for_each_vsi_from(i, v->next_member, v->t, vsi) { + u32 var_type_id = vsi->type, type_id, type_size = 0; + const struct btf_type *var_type = btf_type_by_id(env->btf, + var_type_id); + if (!var_type || !btf_type_is_var(var_type)) { + btf_verifier_log_vsi(env, v->t, vsi, + "Not a VAR kind member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, var_type) && + !env_type_is_resolved(env, var_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, var_type, var_type_id); + } + + type_id = var_type->type; + if (!btf_type_id_size(btf, &type_id, 
&type_size)) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); + return -EINVAL; + } + + if (vsi->size < type_size) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); + return -EINVAL; + } + } + + env_stack_pop_resolved(env, 0, 0); + return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + u32 i; + + seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + for_each_vsi(i, t, vsi) { + var = btf_type_by_id(btf, vsi->type); + if (i) + seq_puts(m, ","); + btf_type_ops(var)->seq_show(btf, var, vsi->type, + data + vsi->offset, bits_offset, m); + } + seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { + .check_meta = btf_datasec_check_meta, + .resolve = btf_datasec_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_datasec_log, + .seq_show = btf_datasec_seq_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, + [BTF_KIND_VAR] = &var_ops, + [BTF_KIND_DATASEC] = &datasec_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, if (!env_type_is_resolved(env, type_id)) return false; - if (btf_type_is_struct(t)) + if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + !btf->resolved_sizes[type_id]; - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || + btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); + return t && + !btf_type_is_modifier(t) && + !btf_type_is_var(t) && + !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { -- cgit v1.2.3 From 2824ecb7010f6a20e9a4140512b798469ab066cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:10 +0200 Subject: bpf: allow for key-less BTF in array map Given we'll be reusing BPF array maps for global data/bss/rodata sections, we need a way to associate BTF DataSec type as its map value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR() macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR") to get initial map to type association going. While more use cases for it are discouraged, this also won't work for global data since the use of array map is a BPF loader detail and therefore unknown at compilation time. For array maps with just a single entry we make an exception in terms of BTF in that key type is declared optional if value type is of DataSec type. The latter LLVM is guaranteed to emit and it also aligns with how we regard global data maps as just a plain buffer area reusing existing map facilities for allowing things like introspection with existing tools. 
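For illustration only, a loader could then associate the BTF with such a map roughly as below; btf_fd, datasec_size and datasec_type_id are placeholders for values obtained when loading the BTF object:

    /* Minimal sketch of a key-less BTF association: a single-entry
     * BPF_MAP_TYPE_ARRAY whose value type is a DATASEC, with
     * btf_key_type_id left at 0 as this patch now permits. */
    union bpf_attr attr = {
            .map_type          = BPF_MAP_TYPE_ARRAY,
            .key_size          = 4,               /* u32 index         */
            .value_size        = datasec_size,    /* section size      */
            .max_entries       = 1,
            .btf_fd            = btf_fd,          /* loaded BTF object */
            .btf_key_type_id   = 0,               /* key-less BTF      */
            .btf_value_type_id = datasec_type_id, /* BTF_KIND_DATASEC  */
    };
    /* ...followed by the usual bpf(BPF_MAP_CREATE, &attr, sizeof(attr)). */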
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 15 ++++++++++++++- kernel/bpf/btf.c | 2 +- kernel/bpf/syscall.c | 15 +++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 217b10bd9f48..584636c9e2eb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -391,7 +391,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -428,6 +429,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0cecf6bab61b..cad09858a5f2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -326,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 198c9680bf0d..438199e2eca4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -504,9 +504,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 key_size, value_size; int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; + /* Some maps allow key to be unspecified. */ + if (btf_key_id) { + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + } else { + key_type = btf_type_by_id(btf, 0); + if (!map->ops->map_check_btf) + return -EINVAL; + } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) @@ -573,7 +580,7 @@ static int map_create(union bpf_attr *attr) if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + if (!attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } -- cgit v1.2.3 From d8743230c9f4e92f370ecd2a90c680ddcede6ae5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 9 Apr 2019 18:35:46 +0100 Subject: sched/topology: Fix build_sched_groups() comment The comment was introduced (pre 2.6.12) by: 8a7a2318dc07 ("[PATCH] sched: consolidate sched domains") and referred to sched_group->cpu_power. This was folded into sched_group->sched_group_power in commit 9c3f75cbd144 ("sched: Break out cpu_power from the sched_group structure") The comment was then updated in: ced549fa5fc1 ("sched: Remove remaining dubious usage of "power"") but should have replaced "sg->cpu_capacity" with "sg->sched_group_capacity". Do that now. 
Signed-off-by: Valentin Schneider Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Cc: qais.yousef@arm.com Link: http://lkml.kernel.org/r/20190409173546.4747-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64bec54ded3e..90e1a870fb0d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1087,8 +1087,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) /* * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc. * * Assumes the sched_domain tree is fully constructed */ -- cgit v1.2.3 From 67d4f6ff2fb69e02bd6365a91ca3939b7a14deac Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 9 Apr 2019 18:35:45 +0100 Subject: sched/topology: Skip duplicate group rewrites in build_sched_groups() While staring at build_sched_domains(), I realized that get_group() does several duplicate (thus useless) writes. If you take the Arm Juno r0 (LITTLEs = [0, 3, 4, 5], bigs = [1, 2]), the sched_group build flow would look like this: ('MC[cpu]->sg' means 'per_cpu_ptr(&tl->data->sg, cpu)' with 'tl == MC') build_sched_groups(MC[CPU0]->sd, CPU0) get_group(0) -> MC[CPU0]->sg get_group(3) -> MC[CPU3]->sg get_group(4) -> MC[CPU4]->sg get_group(5) -> MC[CPU5]->sg build_sched_groups(DIE[CPU0]->sd, CPU0) get_group(0) -> DIE[CPU0]->sg get_group(1) -> DIE[CPU1]->sg <=================+ | build_sched_groups(MC[CPU1]->sd, CPU1) | get_group(1) -> MC[CPU1]->sg | get_group(2) -> MC[CPU2]->sg | | build_sched_groups(DIE[CPU1]->sd, CPU1) ^ get_group(1) -> DIE[CPU1]->sg } We've set these two up here! get_group(3) -> DIE[CPU0]->sg } From this point on, we will only use sched_groups that have been previously visited & initialized. The only new operation will be which group pointer we assign to sd->groups. On the Juno r0 we get 32 get_group() calls, every single one of them writing to a sched_group->cpumask. However, all of the data structures we need are set up after 8 visits (see above). Return early from get_group() if we've already visited (and thus initialized) the sched_group we're looking at. Overlapping domains are not affected as they do not use build_sched_groups(). Tested on a Juno and a 2 * (Xeon E5-2690) system. ( FWIW I initially checked the refs for both sg && sg->sgc, but figured if they weren't both 0 or > 1 then something must have gone wrong, so I threw in a WARN_ON(). ) No change in functionality intended.
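The gist of the change, as a stand-alone sketch of the visited-detection pattern (simplified; the struct and its init step are stand-ins for the real sched_group handling):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct group_sketch {
            atomic_int ref;         /* refcount doubles as "visited" marker */
            /* ... span, capacity, ... */
    };

    static struct group_sketch *get_group_sketch(struct group_sketch *g)
    {
            /* atomic_fetch_add() returns the old value, so old > 0 means
             * an earlier call already initialized this group. */
            bool already_visited = atomic_fetch_add(&g->ref, 1) > 0;

            if (already_visited)
                    return g;       /* skip the duplicate rewrites */

            /* first visit: one-time cpumask/capacity setup goes here */
            return g;
    }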
Signed-off-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 90e1a870fb0d..c65b31e9458b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; struct sched_group *sg; + bool already_visited; if (child) cpu = cpumask_first(sched_domain_span(child)); @@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg = *per_cpu_ptr(sdd->sg, cpu); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); - /* For claim_allocations: */ - atomic_inc(&sg->ref); - atomic_inc(&sg->sgc->ref); + /* Increase refcounts for claim_allocations: */ + already_visited = atomic_inc_return(&sg->ref) > 1; + /* sgc visits should follow a similar trend as sg */ + WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + + /* If we have already visited that group, it's already initialized. */ + if (already_visited) + return sg; if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); -- cgit v1.2.3 From eecec78f777742903ec9167490c625661284155d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:10 -0400 Subject: locking/rwsem: Relocate rwsem_down_read_failed() The rwsem_down_read_failed*() functions were relocated from above the optimistic spinning section to below that section. This enables the reader functions to use optimistic spinning in future patches. There is no code change. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 172 ++++++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fbe96341beee..0d518b52ade4 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -224,92 +224,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); } -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count.
- */ - if (atomic_long_read(&sem->count) >= 0) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - adjustment += RWSEM_WAITING_BIAS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - } - - __set_current_state(TASK_RUNNING); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the @@ -504,6 +418,92 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) } #endif +/* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_ACTIVE_READ_BIAS has already + * been set in the count. + */ + if (atomic_long_read(&sem->count) >= 0) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + adjustment += RWSEM_WAITING_BIAS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! 
+ */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + } + + __set_current_state(TASK_RUNNING); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * Wait until we successfully acquire the write lock */ -- cgit v1.2.3 From c7580c1e84435c9ccc6c612d9fee8e71811f7be6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:11 -0400 Subject: locking/rwsem: Move owner setting code from rwsem.c to rwsem.h Move all the owner setting code closer to the rwsem-xadd fast paths, directly within the rwsem.h file, as well as in the slowpaths where owner setting is done after acquiring the lock. This will enable us to add a DEBUG_RWSEMS check in a later patch to make sure that the read lock is really acquired when rwsem_down_read_failed() returns, for instance. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-3-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 6 +++--- kernel/locking/rwsem.c | 19 ++----------------- kernel/locking/rwsem.h | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0d518b52ade4..c213869e1aa7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -176,9 +176,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, goto try_reader_grant; } /* - * It is not really necessary to set it to reader-owned here, - * but it gives the spinners an early indication that the - * readers now have the lock. + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock.
*/ __rwsem_set_reader_owned(sem, waiter->task); } @@ -441,6 +440,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) */ if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); return sem; } adjustment += RWSEM_WAITING_BIAS; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..59e584895532 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_reader_owned(sem); return 0; } @@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_reader_owned(sem); - } return ret; } @@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_owner(sem); return 0; } @@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_owner(sem); - } return ret; } @@ -119,7 +111,6 @@ void up_read(struct rw_semaphore *sem) rwsem_release(&sem->dep_map, 1, _RET_IP_); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); - rwsem_clear_reader_owned(sem); __up_read(sem); } @@ -133,7 +124,6 @@ void up_write(struct rw_semaphore *sem) rwsem_release(&sem->dep_map, 1, _RET_IP_); DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_clear_owner(sem); __up_write(sem); } @@ -147,7 +137,6 @@ void downgrade_write(struct rw_semaphore *sem) lock_downgrade(&sem->dep_map, _RET_IP_); DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -161,7 +150,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); @@ -172,7 +160,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -193,7 +180,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); @@ -208,7 +194,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) return -EINTR; } - rwsem_set_owner(sem); return 0; } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 1f5775aa6a1d..ee24c4f257a5 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -160,6 +160,8 @@ static inline void __down_read(struct rw_semaphore *sem) { if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) rwsem_down_read_failed(sem); + 
else + rwsem_set_reader_owned(sem); } static inline int __down_read_killable(struct rw_semaphore *sem) @@ -167,8 +169,9 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; + } else { + rwsem_set_reader_owned(sem); } - return 0; } @@ -182,6 +185,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { + rwsem_set_reader_owned(sem); return 1; } } while (tmp >= 0); @@ -199,6 +203,7 @@ static inline void __down_write(struct rw_semaphore *sem) &sem->count); if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) rwsem_down_write_failed(sem); + rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) @@ -210,6 +215,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem) if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; + rwsem_set_owner(sem); return 0; } @@ -219,7 +225,11 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; + if (tmp == RWSEM_UNLOCKED_VALUE) { + rwsem_set_owner(sem); + return true; + } + return false; } /* @@ -229,6 +239,7 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; + rwsem_clear_reader_owned(sem); tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) rwsem_wake(sem); @@ -239,6 +250,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + rwsem_clear_owner(sem); if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, &sem->count) < 0)) rwsem_wake(sem); @@ -259,6 +271,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * write side. As such, rely on RELEASE semantics. */ tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + rwsem_set_reader_owned(sem); if (tmp < 0) rwsem_downgrade_wake(sem); } -- cgit v1.2.3 From 12a30a7fc142a123c61da9623bd824d95d36c12e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:12 -0400 Subject: locking/rwsem: Move rwsem internal function declarations to rwsem-xadd.h We don't need to expose rwsem internal functions which are not supposed to be called directly from other kernel code. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-4-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index ee24c4f257a5..19997c82270b 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -153,6 +153,13 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + /* * lock for reading */ -- cgit v1.2.3 From a338ecb07a338c9a8b0ca0010e862ebe598b1551 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:13 -0400 Subject: locking/rwsem: Micro-optimize rwsem_try_write_lock_unqueued() The atomic_long_cmpxchg_acquire() in rwsem_try_write_lock_unqueued() is replaced by atomic_long_try_cmpxchg_acquire() to simplify the code and generate slightly better assembly code. There is no functional change. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-5-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index c213869e1aa7..f6198e1a58f6 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -259,21 +259,16 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = atomic_long_read(&sem->count); + long count = atomic_long_read(&sem->count); - while (true) { - if (!(count == 0 || count == RWSEM_WAITING_BIAS)) - return false; - - old = atomic_long_cmpxchg_acquire(&sem->count, count, - count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) { + while (!count || count == RWSEM_WAITING_BIAS) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count + RWSEM_ACTIVE_WRITE_BIAS)) { rwsem_set_owner(sem); return true; } - - count = old; } + return false; } static inline bool owner_on_cpu(struct task_struct *owner) -- cgit v1.2.3 From a68e2c4c637918da47b3aa270051545cff7d8245 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:14 -0400 Subject: locking/rwsem: Add debug check for __down_read*() When the rwsem_down_read_failed*() functions return, the read lock has already been acquired indirectly on the caller's behalf. So debug checks are added in __down_read() and __down_read_killable() to make sure the rwsem is really reader-owned. The other debug check calls in kernel/locking/rwsem.c except the one in up_read_non_owner() are also moved over to rwsem.h.
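The condition being asserted, as a stand-alone sketch (assuming the owner-field encoding this series relies on, where a low bit of sem->owner marks reader ownership):

    /* Sketch of the reader-ownership test the new debug checks perform
     * after a successful slowpath acquisition. */
    static inline bool rwsem_is_reader_owned_sketch(struct rw_semaphore *sem)
    {
            return (unsigned long)READ_ONCE(sem->owner) & RWSEM_READER_OWNED;
    }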
Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-6-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 3 --- kernel/locking/rwsem.h | 12 ++++++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 59e584895532..90de5f1780ba 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -109,7 +109,6 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); __up_read(sem); } @@ -122,7 +121,6 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); __up_write(sem); } @@ -135,7 +133,6 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); __downgrade_write(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 19997c82270b..1d8f722a6761 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -165,10 +165,13 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); */ static inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { rwsem_down_read_failed(sem); - else + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED)); + } else { rwsem_set_reader_owned(sem); + } } static inline int __down_read_killable(struct rw_semaphore *sem) @@ -176,6 +179,8 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED)); } else { rwsem_set_reader_owned(sem); } @@ -246,6 +251,7 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); rwsem_clear_reader_owned(sem); tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) @@ -257,6 +263,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_clear_owner(sem); if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, &sem->count) < 0)) @@ -277,6 +284,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. 
*/ + DEBUG_RWSEMS_WARN_ON(sem->owner != current); tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); rwsem_set_reader_owned(sem); if (tmp < 0) -- cgit v1.2.3 From 3b4ba6643d26a95e08067fca9a5da1828f9afabf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:15 -0400 Subject: locking/rwsem: Enhance DEBUG_RWSEMS_WARN_ON() macro Currently, the DEBUG_RWSEMS_WARN_ON() macro just dumps a stack trace when the rwsem isn't in the right state. It does not show the actual states of the rwsem. This may not be that helpful in the debugging process. Enhance the DEBUG_RWSEMS_WARN_ON() macro to also show the current content of the rwsem count and owner fields to give more information about what is wrong with the rwsem. The debug_locks_off() function is called as is done inside DEBUG_LOCKS_WARN_ON(). Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-7-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 3 ++- kernel/locking/rwsem.h | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90de5f1780ba..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -198,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 1d8f722a6761..3059a2dc39f8 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -27,9 +27,15 @@ #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) #ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (long)((sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? 
"" : "not ")) \ + debug_locks_off(); \ + } while (0) #else -# define DEBUG_RWSEMS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem) #endif /* @@ -168,7 +174,7 @@ static inline void __down_read(struct rw_semaphore *sem) if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { rwsem_down_read_failed(sem); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED)); + RWSEM_READER_OWNED), sem); } else { rwsem_set_reader_owned(sem); } @@ -180,7 +186,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED)); + RWSEM_READER_OWNED), sem); } else { rwsem_set_reader_owned(sem); } @@ -251,7 +257,8 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) @@ -263,7 +270,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(sem->owner != current); + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); rwsem_clear_owner(sem); if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, &sem->count) < 0)) @@ -284,7 +291,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current); + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); rwsem_set_reader_owned(sem); if (tmp < 0) -- cgit v1.2.3 From ad53fa10fa9e816067bbae7109845940f5e6df50 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:16 -0400 Subject: locking/qspinlock_stat: Introduce generic lockevent_*() counting APIs The percpu event counts used by qspinlock code can be useful for other locking code as well. So a new set of lockevent_* counting APIs is introduced with the lock event names extracted out into the new lock_events_list.h header file for easier addition in the future. The existing qstat_inc() calls are replaced by either lockevent_inc() or lockevent_cond_inc() calls. The qstat_hop() call is renamed to lockevent_pv_hop(). The "reset_counters" debugfs file is also renamed to ".reset_counts". Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-8-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.h | 55 ++++++++++++ kernel/locking/lock_events_list.h | 50 +++++++++++ kernel/locking/qspinlock.c | 8 +- kernel/locking/qspinlock_paravirt.h | 19 +++-- kernel/locking/qspinlock_stat.h | 163 ++++++++++++++---------------------- 5 files changed, 181 insertions(+), 114 deletions(-) create mode 100644 kernel/locking/lock_events.h create mode 100644 kernel/locking/lock_events_list.h (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..4009e07b474a --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +enum lock_events { + +#include "lock_events_list.h" + + lockevent_num, /* Total number of lock event counts */ + LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ + if (cond) + __this_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ + __this_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_QUEUED_LOCK_STAT */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..8b4d2e180475 --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name) LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. 
+ */ +LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ +LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ +LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ +LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ +LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ +LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ +LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ +LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ +LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. + */ +LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ +LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ +LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5e9247dc2515..e14b32c69639 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); + lockevent_inc(lock_pending); return; /* @@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * queuing. */ queue: - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); pv_queue: node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; @@ -419,7 +419,7 @@ pv_queue: * simple enough. 
*/ if (unlikely(idx >= MAX_NODES)) { - qstat_inc(qstat_lock_no_node, true); + lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); goto release; @@ -430,7 +430,7 @@ pv_queue: /* * Keep counts of non-zero index values: */ - qstat_inc(qstat_lock_use_node2 + idx - 1, idx); + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) if (!(val & _Q_LOCKED_PENDING_MASK) && (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); + lockevent_inc(pv_lock_stealing); return true; } if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); - qstat_hop(hopcnt); + lockevent_pv_hop(hopcnt); return &he->lock; } } @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) smp_store_mb(pn->state, vcpu_halted); if (!READ_ONCE(node->locked)) { - qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_early, wait_early); + lockevent_inc(pv_wait_node); + lockevent_cond_inc(pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ - qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); + lockevent_cond_inc(pv_spurious_wakeup, + !READ_ONCE(node->locked)); } /* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); for (;; waitcnt++) { /* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) } } WRITE_ONCE(pn->state, vcpu_hashed); - qstat_inc(qstat_pv_wait_head, true); - qstat_inc(qstat_pv_wait_again, waitcnt); + lockevent_inc(pv_wait_head); + lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); /* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ - qstat_inc(qstat_pv_kick_unlock, true); + lockevent_inc(pv_kick_unlock); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d73f85388d5c..1db5b375fcf4 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -38,8 +38,8 @@ * Subtracting lock_use_node[234] from lock_slowpath will give you * lock_use_node1. * - * Writing to the "reset_counters" file will reset all the above counter - * values. + * Writing to the special ".reset_counts" file will reset all the above + * counter values. * * These statistical counters are implemented as per-cpu variables which are * summed and computed whenever the corresponding debugfs files are read. This @@ -48,27 +48,7 @@ * * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
*/ -enum qlock_stats { - qstat_pv_hash_hops, - qstat_pv_kick_unlock, - qstat_pv_kick_wake, - qstat_pv_latency_kick, - qstat_pv_latency_wake, - qstat_pv_lock_stealing, - qstat_pv_spurious_wakeup, - qstat_pv_wait_again, - qstat_pv_wait_early, - qstat_pv_wait_head, - qstat_pv_wait_node, - qstat_lock_pending, - qstat_lock_slowpath, - qstat_lock_use_node2, - qstat_lock_use_node3, - qstat_lock_use_node4, - qstat_lock_no_node, - qstat_num, /* Total number of statistical counters */ - qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" #ifdef CONFIG_QUEUED_LOCK_STAT /* @@ -79,99 +59,91 @@ enum qlock_stats { #include #include -static const char * const qstat_names[qstat_num + 1] = { - [qstat_pv_hash_hops] = "pv_hash_hops", - [qstat_pv_kick_unlock] = "pv_kick_unlock", - [qstat_pv_kick_wake] = "pv_kick_wake", - [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", - [qstat_pv_latency_kick] = "pv_latency_kick", - [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_stealing] = "pv_lock_stealing", - [qstat_pv_wait_again] = "pv_wait_again", - [qstat_pv_wait_early] = "pv_wait_early", - [qstat_pv_wait_head] = "pv_wait_head", - [qstat_pv_wait_node] = "pv_wait_node", - [qstat_lock_pending] = "lock_pending", - [qstat_lock_slowpath] = "lock_slowpath", - [qstat_lock_use_node2] = "lock_use_node2", - [qstat_lock_use_node3] = "lock_use_node3", - [qstat_lock_use_node4] = "lock_use_node4", - [qstat_lock_no_node] = "lock_no_node", - [qstat_reset_cnts] = "reset_counters", +#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", }; /* * Per-cpu counters */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* * Function to read and return the qlock statistical counter values * * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. pv_latency_kick * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake * Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. 
pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; - int cpu, counter, len; - u64 stat = 0, kicks = 0; + int cpu, id, len; + u64 sum = 0, kicks = 0; /* * Get the counter ID stored in file->f_inode->i_private */ - counter = (long)file_inode(file)->i_private; + id = (long)file_inode(file)->i_private; - if (counter >= qstat_num) + if (id >= lockevent_num) return -EBADF; for_each_possible_cpu(cpu) { - stat += per_cpu(qstats[counter], cpu); + sum += per_cpu(lockevents[id], cpu); /* - * Need to sum additional counter for some of them + * Need to sum additional counters for some of them */ - switch (counter) { + switch (id) { - case qstat_pv_latency_kick: - case qstat_pv_hash_hops: - kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + case LOCKEVENT_pv_latency_kick: + case LOCKEVENT_pv_hash_hops: + kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); break; - case qstat_pv_latency_wake: - kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + case LOCKEVENT_pv_latency_wake: + kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); break; } } - if (counter == qstat_pv_hash_hops) { + if (id == LOCKEVENT_pv_hash_hops) { u64 frac = 0; if (kicks) { - frac = 100ULL * do_div(stat, kicks); + frac = 100ULL * do_div(sum, kicks); frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); } /* * Return a X.XX decimal number */ - len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", + sum, frac); } else { /* * Round to the nearest ns */ - if ((counter == qstat_pv_latency_kick) || - (counter == qstat_pv_latency_wake)) { + if ((id == LOCKEVENT_pv_latency_kick) || + (id == LOCKEVENT_pv_latency_wake)) { if (kicks) - stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); } - len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); } return simple_read_from_buffer(user_buf, count, ppos, buf, len); @@ -180,11 +152,9 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Function to handle write request * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. + * When id = .reset_cnts, reset all the counter values. 
*/ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { int cpu; @@ -192,14 +162,14 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if ((long)file_inode(file)->i_private != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) return count; for_each_possible_cpu(cpu) { int i; - unsigned long *ptr = per_cpu_ptr(qstats, cpu); + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); - for (i = 0 ; i < qstat_num; i++) + for (i = 0 ; i < lockevent_num; i++) WRITE_ONCE(ptr[i], 0); } return count; @@ -208,9 +178,9 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Debugfs data structures */ -static const struct file_operations fops_qstat = { - .read = qstat_read, - .write = qstat_write, +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, .llseek = default_llseek, }; @@ -219,10 +189,10 @@ static const struct file_operations fops_qstat = { */ static int __init init_qspinlock_stat(void) { - struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + struct dentry *d_counts = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) + if (!d_counts) goto out; /* @@ -232,39 +202,31 @@ static int __init init_qspinlock_stat(void) * root is allowed to do the read/write to limit impact to system * performance. */ - for (i = 0; i < qstat_num; i++) - if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat)) + for (i = 0; i < lockevent_num; i++) + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) goto fail_undo; - if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat)) + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) goto fail_undo; return 0; fail_undo: - debugfs_remove_recursive(d_qstat); + debugfs_remove_recursive(d_counts); out: pr_warn("Could not create 'qlockstat' debugfs entries\n"); return -ENOMEM; } fs_initcall(init_qspinlock_stat); -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ - if (cond) - this_cpu_inc(qstats[stat]); -} - /* * PV hash hop count */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt) { - this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); + this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); } /* @@ -276,7 +238,7 @@ static inline void __pv_kick(int cpu) per_cpu(pv_kick_time, cpu) = start; pv_kick(cpu); - this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); + this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); } /* @@ -289,9 +251,9 @@ static inline void __pv_wait(u8 *ptr, u8 val) *pkick_time = 0; pv_wait(ptr, val); if (*pkick_time) { - this_cpu_add(qstats[qstat_pv_latency_wake], + this_cpu_add(EVENT_COUNT(pv_latency_wake), sched_clock() - *pkick_time); - qstat_inc(qstat_pv_kick_wake, true); + lockevent_inc(pv_kick_wake); } } @@ -300,7 +262,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #else /* CONFIG_QUEUED_LOCK_STAT */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -static inline void qstat_hop(int hopcnt) { } +static inline void 
lockevent_pv_hop(int hopcnt) { } #endif /* CONFIG_QUEUED_LOCK_STAT */ -- cgit v1.2.3 From fb346fd9fc081c3d978c3f3d26d39334527a2662 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:17 -0400 Subject: locking/lock_events: Make lock_events available for all archs & other locks The QUEUED_LOCK_STAT option to report queued spinlocks event counts was previously allowed only on x86 architecture. To make the locking event counting code more useful, it is now renamed to a more generic LOCK_EVENT_COUNTS config option. This new option will be available to all the architectures that use qspinlock at the moment. Other locking code can now start to use the generic locking event counting code by including lock_events.h and put the new locking event names into the lock_events_list.h header file. My experience with lock event counting is that it gives valuable insight on how the locking code works and what can be done to make it better. I would like to extend this benefit to other locking code like mutex and rwsem in the near future. The PV qspinlock specific code will stay in qspinlock_stat.h. The locking event counters will now reside in the /lock_event_counts directory. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-9-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 1 + kernel/locking/lock_events.c | 153 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/lock_events.h | 10 ++- kernel/locking/qspinlock_stat.h | 141 ++++-------------------------------- 4 files changed, 173 insertions(+), 132 deletions(-) create mode 100644 kernel/locking/lock_events.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 1af83e9ce57d..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..71c36d1fb834 --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +/* + * Collect locking event counts + */ +#include +#include +#include +#include + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the /lock_event_counts/ + * directory. 
See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (!d_counts) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. 
+ */ + for (i = 0; i < lockevent_num; i++) + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) + goto fail_undo; + + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 4009e07b474a..feb1acc54611 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -13,6 +13,9 @@ * Authors: Waiman Long */ +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + enum lock_events { #include "lock_events_list.h" @@ -21,7 +24,7 @@ enum lock_events { LOCKEVENT_reset_cnts = lockevent_num, }; -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS /* * Per-cpu counters */ @@ -46,10 +49,11 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#else /* CONFIG_LOCK_EVENT_COUNTS */ #define lockevent_inc(ev) #define lockevent_add(ev, c) #define lockevent_cond_inc(ev, c) -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 1db5b375fcf4..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -9,76 +9,29 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Authors: Waiman Long + * Authors: Waiman Long */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * /qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * lock_pending - # of locking operations via pending code - * lock_slowpath - # of locking operations via MCS lock queue - * lock_use_node2 - # of locking operations that use 2nd per-CPU node - * lock_use_node3 - # of locking operations that use 3rd per-CPU node - * lock_use_node4 - # of locking operations that use 4th per-CPU node - * lock_no_node - # of locking operations without using per-CPU node - * - * Subtracting lock_use_node[234] from lock_slowpath will give you - * lock_use_node1. - * - * Writing to the special ".reset_counts" file will reset all the above - * counter values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. 
- * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. - */ #include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include #include #include #include #define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] -#undef LOCK_EVENT -#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, - -static const char * const lockevent_names[lockevent_num + 1] = { - -#include "lock_events_list.h" - - [LOCKEVENT_reset_cnts] = ".reset_counts", -}; - /* - * Per-cpu counters + * PV specific per-cpu counter */ -DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: * 1. pv_latency_kick @@ -88,8 +41,8 @@ static DEFINE_PER_CPU(u64, pv_kick_time); * 3. pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t lockevent_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; int cpu, id, len; @@ -149,78 +102,6 @@ static ssize_t lockevent_read(struct file *file, char __user *user_buf, return simple_read_from_buffer(user_buf, count, ppos, buf, len); } -/* - * Function to handle write request - * - * When id = .reset_cnts, reset all the counter values. - */ -static ssize_t lockevent_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(lockevents, cpu); - - for (i = 0 ; i < lockevent_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_lockevent = { - .read = lockevent_read, - .write = lockevent_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_counts = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_counts) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. 
- */ - for (i = 0; i < lockevent_num; i++) - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) - goto fail_undo; - - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, - d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_counts); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - /* * PV hash hop count */ @@ -260,8 +141,10 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ -- cgit v1.2.3 From bf20616f46e536fe8affed6f138db4b3040b55a6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:18 -0400 Subject: locking/lock_events: Don't show pvqspinlock events on bare metal On bare metal, the pvqspinlock event counts will always be 0. So there is no point in showing their corresponding debugfs files. So they are skipped in this case. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-10-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c index 71c36d1fb834..fa2c2f951c6b 100644 --- a/kernel/locking/lock_events.c +++ b/kernel/locking/lock_events.c @@ -115,6 +115,29 @@ static const struct file_operations fops_lockevent = { .llseek = default_llseek, }; +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include + +static bool __init skip_lockevent(const char *name) +{ + static int pv_on __initdata = -1; + + if (pv_on < 0) + pv_on = !pv_is_native_spin_unlock(); + /* + * Skip PV qspinlock events on bare metal. + */ + if (!pv_on && !memcmp(name, "pv_", 3)) + return true; + return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ + return false; +} +#endif + /* * Initialize debugfs for the locking event counts. */ @@ -133,10 +156,13 @@ static int __init init_lockevent_counts(void) * root is allowed to do the read/write to limit impact to system * performance. */ - for (i = 0; i < lockevent_num; i++) + for (i = 0; i < lockevent_num; i++) { + if (skip_lockevent(lockevent_names[i])) + continue; if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, (void *)(long)i, &fops_lockevent)) goto fail_undo; + } if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, d_counts, (void *)(long)LOCKEVENT_reset_cnts, -- cgit v1.2.3 From a8654596f0371c2604c4d475422c48f4fc6a56c9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:19 -0400 Subject: locking/rwsem: Enable lock event counting Add lock event counting calls so that we can track the number of lock events happening in the rwsem code. 
With CONFIG_LOCK_EVENT_COUNTS on and booting a 4-socket 112-thread x86-64 system, the rwsem counts after system bootup were as follows: rwsem_opt_fail=261 rwsem_opt_wlock=50636 rwsem_rlock=445 rwsem_rlock_fail=0 rwsem_rlock_fast=22 rwsem_rtrylock=810144 rwsem_sleep_reader=441 rwsem_sleep_writer=310 rwsem_wake_reader=355 rwsem_wake_writer=2335 rwsem_wlock=261 rwsem_wlock_fail=0 rwsem_wtrylock=20583 It can be seen that most of the lock acquisitions in the slowpath were write-locks in the optimistic spinning code path with no sleeping at all. For this system, over 97% of the locks are acquired via optimistic spinning. It illustrates the importance of optimistic spinning in improving the performance of rwsem. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-11-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 17 +++++++++++++++++ kernel/locking/rwsem-xadd.c | 11 +++++++++++ kernel/locking/rwsem.h | 4 ++++ 3 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 8b4d2e180475..ad7668cfc9da 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -48,3 +48,20 @@ LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ +LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ +LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ +LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ +LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ +LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ +LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index f6198e1a58f6..6b3ee9948bf1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * will notice the queued writer. 
*/ wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); } return; @@ -214,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); if (list_empty(&sem->wait_list)) { /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -265,6 +267,7 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, count + RWSEM_ACTIVE_WRITE_BIAS)) { rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); return true; } } @@ -389,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) osq_unlock(&sem->osq); done: preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } @@ -436,6 +440,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); return sem; } adjustment += RWSEM_WAITING_BIAS; @@ -472,9 +477,11 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) break; } schedule(); + lockevent_inc(rwsem_sleep_reader); } __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); return sem; out_nolock: list_del(&waiter.list); @@ -482,6 +489,7 @@ out_nolock: atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); return ERR_PTR(-EINTR); } @@ -575,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) goto out_nolock; schedule(); + lockevent_inc(rwsem_sleep_writer); set_current_state(state); } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); @@ -583,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); return ret; @@ -596,6 +606,7 @@ out_nolock: __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 3059a2dc39f8..37db17890e36 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -23,6 +23,8 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. */ +#include "lock_events.h" + #define RWSEM_READER_OWNED (1UL << 0) #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) @@ -200,6 +202,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ long tmp = RWSEM_UNLOCKED_VALUE; + lockevent_inc(rwsem_rtrylock); do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { @@ -241,6 +244,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; + lockevent_inc(rwsem_wtrylock); tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); if (tmp == RWSEM_UNLOCKED_VALUE) { -- cgit v1.2.3 From ee6a6500fe1f5c5a3f18de33fe0178a3c627f6d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 10 Apr 2019 10:45:38 -0400 Subject: ftrace: Remove ASSIGN_OPS_HASH() macro from ftrace.c The ASSIGN_OPS_HASH() macro was moved to fgraph.c where it was used, but for some reason it wasn't removed from ftrace.c, as it is no longer referenced there. 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26c8ca9bd06b..bf11e0553450 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,12 +69,8 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#define ASSIGN_OPS_HASH(opsname, val) \ - .func_hash = val, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) -#define ASSIGN_OPS_HASH(opsname, val) #endif enum { -- cgit v1.2.3 From 83ca259489409a1fe8a83dad83a82f32174d4f31 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Apr 2019 09:15:25 +0800 Subject: swiotlb: dump used and total slots when swiotlb buffer is full So far the kernel only prints the requested size if swiotlb buffer if full. It is not possible to know whether it is simply an out of buffer, or it is because swiotlb cannot allocate buffer with the requested size due to fragmentation. As 'io_tlb_used' is available since commit 71602fe6d4e9 ("swiotlb: add debugfs to track swiotlb buffer usage"), both 'io_tlb_used' and 'io_tlb_nslabs' are printed when swiotlb buffer is full. Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2b0c8fd9658e..82c767374c70 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -533,7 +533,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu, used %lu\n", + size, io_tlb_nslabs, io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; -- cgit v1.2.3 From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). 
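As a rough user-space sketch of the intended usage (prog_fd, pkt and pkt_len are placeholders; the test.* field names follow the bpf_attr additions in this patch):

	struct __sk_buff ctx_in = {}, ctx_out = {};
	union bpf_attr attr = {};

	ctx_in.cb[0] = 1;		/* meta data the program can read */
	ctx_in.priority = 2;		/* only cb[] and priority may be set */

	attr.test.prog_fd = prog_fd;
	attr.test.data_in = (__u64)(unsigned long)pkt;
	attr.test.data_size_in = pkt_len;
	attr.test.ctx_in = (__u64)(unsigned long)&ctx_in;
	attr.test.ctx_size_in = sizeof(ctx_in);
	attr.test.ctx_out = (__u64)(unsigned long)&ctx_out;
	attr.test.ctx_size_out = sizeof(ctx_out);

	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
	/* on success, ctx_out holds the potentially modified __sk_buff */
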
v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 438199e2eca4..d995eedfdd16 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,7 +2009,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2022,6 +2022,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From 2fa717a0337e7acafda9283c938b635191b8036b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 11 Apr 2019 11:46:13 -0400 Subject: ftrace: Do not process STUB functions in ftrace_ops_list_func() The function_graph tracer has a stub function and its ops flag has the FTRACE_OPS_FL_STUB set. As the function graph does not use the ftrace_ops->func pointer but instead is called by a separate part of the ftrace trampoline. The function_graph tracer still requires to pass in a ftrace_ops that may also hold the hash of the functions to call. But there's no reason to test that hash in the function tracing portion. Instead of testing to see if we should call the stub function, just test if the ops has FTRACE_OPS_FL_STUB set, and just skip it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf11e0553450..433a64f49532 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6260,6 +6260,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Stub functions don't need to be called nor tested */ + if (op->flags & FTRACE_OPS_FL_STUB) + continue; /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true -- cgit v1.2.3 From b1cd609d9b517f01867c211bd520cc805db3068a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Tue, 12 Mar 2019 09:27:09 -0700 Subject: bpf: Add base proto function for cgroup-bpf programs Currently kernel/bpf/cgroup.c contains only one program type and one proto function cgroup_dev_func_proto(). It'd be useful to have base proto function that can be reused for new cgroup-bpf program types coming soon. Introduce cgroup_base_func_proto(). 
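The intended reuse pattern, sketched with a hypothetical program type (foo_func_proto and its helper are illustrative only, not part of this patch):

	static const struct bpf_func_proto *
	foo_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
	{
		switch (func_id) {
		case BPF_FUNC_foo_helper:	/* hypothetical type-specific helper */
			return &bpf_foo_helper_proto;
		default:
			/* fall back to the helpers common to all cgroup-bpf types */
			return cgroup_base_func_proto(func_id, prog);
		}
	}
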
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4e807973aa80..f6cd38746df2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -701,7 +701,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -725,6 +725,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit v1.2.3 From 7b146cebe30cb481b0f70d85779da938da818637 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 12:59:24 -0800 Subject: bpf: Sysctl hook Containerized applications may run as root and it may create problems for whole host. Specifically such applications may change a sysctl and affect applications in other containers. Furthermore in existing infrastructure it may not be possible to just completely disable writing to sysctl, instead such a process should be gradual with ability to log what sysctl are being changed by a container, investigate, limit the set of writable sysctl to currently used ones (so that new ones can not be changed) and eventually reduce this set to zero. The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis. New program type has access to following minimal context: struct bpf_sysctl { __u32 write; }; Where @write indicates whether sysctl is being read (= 0) or written (= 1). Helpers to access sysctl name and value will be introduced separately. BPF_CGROUP_SYSCTL attach point is added to sysctl code right before passing control to ctl_table->proc_handler so that BPF program can either allow or deny access to sysctl. Suggested-by: Roman Gushchin Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++ kernel/bpf/verifier.c | 1 + 3 files changed, 100 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f6cd38746df2..610491b5f0aa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -768,3 +770,93 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. 
In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + }; + struct cgroup *cgrp; + int ret; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d995eedfdd16..92c9b8a32b50 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1888,6 +1888,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1966,6 +1969,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1999,6 +2005,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f25b7c9c20ba..20808e3c95a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5267,6 +5267,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: break; default: return 0; -- cgit v1.2.3 From 808649fb787d918a48a360a668ee4ee9023f0c11 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 13:28:48 -0800 Subject: bpf: Introduce bpf_sysctl_get_name helper Add bpf_sysctl_get_name() helper to copy sysctl name 
(/proc/sys/ entry) into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem". If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied, e.g. "tcp_mem". Documentation for the new helper is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 610491b5f0aa..a68387043244 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -806,10 +807,77 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return cgroup_base_func_proto(func_id, prog); + switch (func_id) { + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } } static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, -- cgit v1.2.3 From 1d11b3016cec4ed9770b98e82a61708c8f4926e7 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Feb 2019 19:22:15 -0800 Subject: bpf: Introduce bpf_sysctl_get_current_value helper Add bpf_sysctl_get_current_value() helper to copy current sysctl value into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. It provides same string as user space can see by reading corresponding file in /proc/sys/, including new line, etc. Documentation for the new helper is provided in bpf.h UAPI. Since current value is kept in ctl_table->data in a parsed form, ctl_table->proc_handler() with write=0 is called to read that data and convert it to a string. Such a string can later be parsed by a program using helpers that will be introduced separately. 
Unfortunately it's not trivial to provide API to access parsed data due to variety of data representations (string, intvec, uintvec, ulongvec, custom structures, even NULL, etc). Instead it's assumed that user know how to handle specific sysctl they're interested in and appropriate helpers can be used. Since ctl_table->proc_handler() expects __user buffer, conversion to __user happens for kernel allocated one where the value is stored. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a68387043244..c6b2cf29a54b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -794,15 +794,37 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .head = head, .table = table, .write = write, + .cur_val = NULL, + .cur_len = PAGE_SIZE, }; struct cgroup *cgrp; int ret; + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); rcu_read_unlock(); + kfree(ctx.cur_val); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -869,12 +891,55 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .arg4_type = ARG_ANYTHING, }; +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From 4e63acdff864654cee0ac5aaeda3913798ee78f6 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:38:43 -0800 Subject: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers Add helpers to work with new value being written to sysctl by user space. bpf_sysctl_get_new_value() copies value being written to sysctl into provided buffer. bpf_sysctl_set_new_value() overrides new value being written by user space with a one from provided buffer. 
Buffer should contain string representation of the value, similar to what can be seen in /proc/sys/. Both helpers can be used only on sysctl write. File position matters and can be managed by an interface that will be introduced separately. E.g. if user space calls sys_write to a file in /proc/sys/ at file position = X, where X > 0, then the value set by bpf_sysctl_set_new_value() will be written starting from X. If program wants to override whole value with specified buffer, file position has to be set to zero. Documentation for the new helpers is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c6b2cf29a54b..ba4e21986760 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -778,6 +778,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -788,7 +795,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type) + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, @@ -796,6 +804,9 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .write = write, .cur_val = NULL, .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, }; struct cgroup *cgrp; int ret; @@ -818,6 +829,18 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, ctx.cur_len = 0; } + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min(PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. */ + ctx.new_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); @@ -825,6 +848,13 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -932,6 +962,51 @@ static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -940,6 +1015,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From e1550bfe0de47e30484ba91de1e50a91ec1c31f5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:50:52 -0800 Subject: bpf: Add file_pos field to bpf_sysctl ctx Add file_pos field to bpf_sysctl context to read and write sysctl file position at which sysctl is being accessed (read or written). The field can be used to e.g. override whole sysctl value on write to sysctl even when sys_write is called by user space with file_pos > 0. Or BPF program may reject such accesses. 
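For example, a program can insist that the whole value is replaced regardless of the position user space wrote at (a rough sketch using the libbpf-style SEC() annotation; not part of this patch):

	SEC("cgroup/sysctl")
	int sysctl_file_pos(struct bpf_sysctl *ctx)
	{
		/* deny partial writes at a non-zero offset ... */
		if (ctx->write && ctx->file_pos != 0)
			return 0;
		/* ... or, alternatively, rewind so the new value replaces
		 * the old one from the beginning:
		 *	ctx->file_pos = 0;
		 */
		return 1;	/* allow the access */
	}
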
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba4e21986760..b2adf22139b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -782,6 +782,9 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise * @new_buf: pointer to pointer to new buffer that will be allocated if program * overrides new value provided by user space on sysctl write * NOTE: it's caller responsibility to free *new_buf if it was set @@ -796,12 +799,14 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type) + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, .table = table, .write = write, + .ppos = ppos, .cur_val = NULL, .cur_len = PAGE_SIZE, .new_val = NULL, @@ -1030,14 +1035,22 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, { const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sysctl) || - off % size || type != BPF_READ) + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) return false; switch (off) { case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } default: return false; } @@ -1059,6 +1072,41 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, write), target_size)); break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. 
+ */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; } return insn - insn_buf; -- cgit v1.2.3 From 57c3bb725a3dd97d960d7e1cd0845d88de53217f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 16:57:10 -0700 Subject: bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types Currently the way to pass result from BPF helper to BPF program is to provide memory area defined by pointer and size: func(void *, size_t). It works great for generic use-case, but for simple types, such as int, it's overkill and consumes two arguments when it could use just one. Introduce new argument types ARG_PTR_TO_INT and ARG_PTR_TO_LONG to be able to pass result from helper to program via pointer to int and long correspondingly: func(int *) or func(long *). New argument types are similar to ARG_PTR_TO_MEM with the following differences: * they don't require corresponding ARG_CONST_SIZE argument, predefined access sizes are used instead (32bit for int, 64bit for long); * it's possible to use more than one such an argument in a helper; * provided pointers have to be aligned. It's easy to introduce similar ARG_PTR_TO_CHAR and ARG_PTR_TO_SHORT argument types. It's not done due to lack of use-case though. 
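A sketch of what a helper proto using the new types looks like (bpf_foo and its proto are hypothetical):

	static const struct bpf_func_proto bpf_foo_proto = {
		.func		= bpf_foo,	/* e.g. bpf_foo(const char *, size_t, long *) */
		.ret_type	= RET_INTEGER,
		.arg1_type	= ARG_PTR_TO_MEM,
		.arg2_type	= ARG_CONST_SIZE,
		/* one argument slot for the result; the verifier checks that
		 * the pointer is aligned and that 8 bytes are accessible
		 */
		.arg3_type	= ARG_PTR_TO_LONG,
	};
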
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20808e3c95a8..15ab6fa817ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2462,6 +2462,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_INT || + type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ + if (type == ARG_PTR_TO_INT) + return sizeof(u32); + else if (type == ARG_PTR_TO_LONG) + return sizeof(u64); + + return -EINVAL; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2554,6 +2570,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_int_ptr(arg_type)) { + expected_type = PTR_TO_STACK; + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) + goto err_type; } else { verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -2635,6 +2657,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + } else if (arg_type_is_int_ptr(arg_type)) { + int size = int_ptr_type_to_size(arg_type); + + err = check_helper_mem_access(env, regno, size, false, meta); + if (err) + return err; + err = check_ptr_alignment(env, reg, 0, size, true); } return err; -- cgit v1.2.3 From d7a4cb9b6705a89937d12c8158a35a3145dc967a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 17:55:26 -0700 Subject: bpf: Introduce bpf_strtol and bpf_strtoul helpers Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned long, respectively. They are similar to user space strtol(3) and strtoul(3) with a few changes to the API: * instead of a NUL-terminated C string the helpers expect a buffer and a buffer length; * the resulting long or unsigned long is returned in a separate result argument; * the return value is used to indicate success or failure; on success the number of consumed bytes is returned, which can be used to identify the position to read next if the buffer is expected to contain multiple integers; * instead of a *base* argument, *flags* is used; it provides the base in the 5 LSB, and the other bits are reserved for future use; * the number of supported bases is limited. Documentation for the new helpers is provided in the bpf.h UAPI. The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs so that they can convert string input to e.g. "ulongvec" output. E.g. "net/ipv4/tcp_mem" consists of three ulong integers, which can be parsed by calling bpf_strtoul three times. Implementation notes: The implementation includes "../../lib/kstrtox.h" to reuse the integer parsing functions. It's done exactly the same way as fs/proc/base.c already does. Unfortunately the existing kstrtoX functions can't be used directly since they fail if any invalid character is present right after the integer in the string. The existing simple_strtoX functions can't be used either since they're obsolete and don't handle overflow properly.
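As a usage illustration, a BPF_PROG_TYPE_CGROUP_SYSCTL program could parse the three tcp_mem values roughly as follows. This is a minimal sketch, not code from the patch: it assumes the bpf_sysctl_get_new_value() helper introduced earlier in this series, the program and section names are illustrative, and a real program additionally has to keep the length argument within verifier-provable bounds.

	SEC("cgroup/sysctl")
	int sysctl_tcp_mem(struct bpf_sysctl *ctx)
	{
		unsigned long tcp_mem[3];	/* ARG_PTR_TO_LONG requires stack pointers */
		char value[64] = {};
		int ret, off = 0, i;

		/* copy the value being written; fails if this is not a write */
		ret = bpf_sysctl_get_new_value(ctx, value, sizeof(value));
		if (ret < 0)
			return 1;	/* reads are allowed through */

		for (i = 0; i < 3; i++) {
			/* flags == 0: base in the 5 LSB, 0 means autodetect */
			ret = bpf_strtoul(value + off, sizeof(value) - off, 0,
					  &tcp_mem[i]);
			if (ret <= 0)
				return 0;	/* parse failure, reject the write */
			off += ret;	/* ret is the number of consumed bytes */
		}
		/* policy checks on tcp_mem[0..2] would go here */
		return 1;	/* allow the write */
	}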
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 4 ++ kernel/bpf/helpers.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b2adf22139b3..789d4ab2336e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1016,6 +1016,10 @@ static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a411fc17d265..4266ffde07ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = 
__bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif -- cgit v1.2.3 From 51356ac89b5a15e5207e8740d5f4f8b71cb7332f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 12 Apr 2019 16:01:01 -0700 Subject: bpf: Fix distinct pointer types warning for ARCH=i386 Fix a new warning reported by kbuild for make ARCH=i386: In file included from kernel/bpf/cgroup.c:11:0: kernel/bpf/cgroup.c: In function '__cgroup_bpf_run_filter_sysctl': include/linux/kernel.h:827:29: warning: comparison of distinct pointer types lacks a cast (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) ^ include/linux/kernel.h:841:4: note: in expansion of macro '__typecheck' (__typecheck(x, y) && __no_side_effects(x, y)) ^~~~~~~~~~~ include/linux/kernel.h:851:24: note: in expansion of macro '__safe_cmp' __builtin_choose_expr(__safe_cmp(x, y), \ ^~~~~~~~~~ include/linux/kernel.h:860:19: note: in expansion of macro '__careful_cmp' #define min(x, y) __careful_cmp(x, y, <) ^~~~~~~~~~~~~ >> kernel/bpf/cgroup.c:837:17: note: in expansion of macro 'min' ctx.new_len = min(PAGE_SIZE, *pcount); ^~~ Fixes: 4e63acdff864 ("bpf: Introduce bpf_sysctl_{get,set}_new_value helpers") Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 789d4ab2336e..e58a6c247f56 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -839,7 +839,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); - ctx.new_len = min(PAGE_SIZE, *pcount); + ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); if (!ctx.new_val || copy_from_user(ctx.new_val, buf, ctx.new_len)) /* Let BPF program decide how to proceed. */ -- cgit v1.2.3 From 53b29c336830db48ad3dc737f88b8c065b1f0851 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 12 Apr 2019 19:38:26 +0800 Subject: swiotlb: save io_tlb_used to local variable before leaving critical section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swiotlb is full, the kernel would print io_tlb_used. However, the result might be inaccurate at that time because we have already left the critical section protected by the spinlock. Therefore, back up io_tlb_used into a local variable before leaving the critical section.
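The underlying pattern is generic: a value that is only stable while the lock is held has to be snapshotted into a local variable before the unlock if it is going to be reported afterwards. A condensed sketch of the shape of the fix, using the names from the diff below:

	spin_lock_irqsave(&io_tlb_lock, flags);
	/* ... slot search failed, fall through to not_found ... */
	tmp_io_tlb_used = io_tlb_used;	/* snapshot while still protected */
	spin_unlock_irqrestore(&io_tlb_lock, flags);

	/* io_tlb_used may change from here on; report the snapshot instead */
	dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
		 size, io_tlb_nslabs, tmp_io_tlb_used);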
Fixes: 83ca25948940 ("swiotlb: dump used and total slots when swiotlb buffer is full") Suggested-by: Håkon Bugge Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 82c767374c70..38d57218809c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,6 +445,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, unsigned long mask; unsigned long offset_slots; unsigned long max_slots; + unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -531,10 +532,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, } while (index != wrap); not_found: + tmp_io_tlb_used = io_tlb_used; + spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu, used %lu\n", - size, io_tlb_nslabs, io_tlb_used); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + size, io_tlb_nslabs, tmp_io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; -- cgit v1.2.3 From 1b04aee7e2182454a663950e68084fa5ada9625a Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:34 +0100 Subject: bpf: refactor propagate_liveness to eliminate duplicated for loop Propagation for registers and for stack slots is finished in two separate for loops, although they could perfectly well be put into a single loop. This also lets them share some common variables in later patches. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ab6fa817ce..da285df492fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6254,10 +6254,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, return err; } } - } - /* ... and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { + /* Propagate stack slots. */ state = vstate->frame[frame]; parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && -- cgit v1.2.3 From 3f8cafa4131f67d47c8de326c7dcd561cc65fb38 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:35 +0100 Subject: bpf: refactor propagate_liveness to eliminate code redundancy Accesses to the reg states were not factored out; the consequence is long dereferencing code whose indentation is hard to read. This patch factors that code out so the core logic in the loop is easier to follow.
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da285df492fd..6fd36a8ba1a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6232,8 +6232,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, struct bpf_verifier_state *vparent) { - int i, frame, err = 0; + struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; + int i, frame, err = 0; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -6243,28 +6244,33 @@ static int propagate_liveness(struct bpf_verifier_env *env, /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); for (frame = 0; frame <= vstate->curframe; frame++) { + parent = vparent->frame[frame]; + state = vstate->frame[frame]; + parent_reg = parent->regs; + state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + if (parent_reg[i].live & REG_LIVE_READ) continue; - if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[frame]->regs[i], - &vparent->frame[frame]->regs[i]); - if (err) - return err; - } + if (!(state_reg[i].live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + if (err) + return err; } /* Propagate stack slots. */ - state = vstate->frame[frame]; - parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + parent_reg = &parent->stack[i].spilled_ptr; + state_reg = &state->stack[i].spilled_ptr; + if (parent_reg->live & REG_LIVE_READ) continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_reg_read(env, &state->stack[i].spilled_ptr, - &parent->stack[i].spilled_ptr); + continue; + if (!(state_reg->live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, state_reg, parent_reg); + if (err) + return err; } } return err; -- cgit v1.2.3 From 55e7f3b5ac94a8c9f2e35961a45e9aa526a9e41d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:36 +0100 Subject: bpf: factor out reg and stack slot propagation into "propagate_liveness_reg" After the code refactoring in the previous patches, it becomes clear that the propagation logic inside the for loop in "propagate_liveness" is uniform enough to be factored out into a common function "propagate_liveness_reg".
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6fd36a8ba1a0..3fdb301c4f8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6221,6 +6221,22 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +static int propagate_liveness_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct bpf_reg_state *parent_reg) +{ + int err; + + if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + return 0; + + err = mark_reg_read(env, reg, parent_reg); + if (err) + return err; + + return 0; +} + /* A write screens off any subsequent reads; but write marks come from the * straight-line code between a state and its parent. When we arrive at an * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -6250,11 +6266,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (parent_reg[i].live & REG_LIVE_READ) - continue; - if (!(state_reg[i].live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + err = propagate_liveness_reg(env, &state_reg[i], + &parent_reg[i]); if (err) return err; } @@ -6264,11 +6277,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, i < parent->allocated_stack / BPF_REG_SIZE; i++) { parent_reg = &parent->stack[i].spilled_ptr; state_reg = &state->stack[i].spilled_ptr; - if (parent_reg->live & REG_LIVE_READ) - continue; - if (!(state_reg->live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, state_reg, parent_reg); + err = propagate_liveness_reg(env, state_reg, + parent_reg); if (err) return err; } -- cgit v1.2.3 From c342dc109aa5a4f0bb36335cb441aaafc98b98ef Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:37 +0100 Subject: bpf: refactor "check_reg_arg" to eliminate code redundancy There are a few "regs[regno]" dereferences here and there across "check_reg_arg"; this patch factors them out into a simple "reg" pointer. The intention is to simplify the code indentation and make the later patches in this set look cleaner.
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3fdb301c4f8c..c7220153c5b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1177,30 +1177,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); return -EINVAL; } + reg = ®s[regno]; if (t == SRC_OP) { /* check whether register used as source operand can be read */ - if (regs[regno].type == NOT_INIT) { + if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ - if (regno != BPF_REG_FP) - return mark_reg_read(env, ®s[regno], - regs[regno].parent); + if (regno == BPF_REG_FP) + return 0; + + return mark_reg_read(env, reg, reg->parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - regs[regno].live |= REG_LIVE_WRITTEN; + reg->live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } -- cgit v1.2.3 From 26536e7c242e2b0f73c25c46fc50d2525ebe400b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 13 Apr 2019 13:22:44 -0400 Subject: locking/rwsem: Prevent unneeded warning during locking selftest Disable the DEBUG_RWSEMS check when locking selftest is running with debug_locks_silent flag set. Signed-off-by: Waiman Long Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: http://lkml.kernel.org/r/20190413172259.2740-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 37db17890e36..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -30,7 +30,8 @@ #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ - if (WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ (long)((sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ -- cgit v1.2.3 From 2dfed4565afe263751d2451ad22336ad806c25a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:28:04 +0200 Subject: lockdep: Remove the ULONG_MAX stack trace hackery No architecture terminates the stack trace with ULONG_MAX anymore. Remove the cruft. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Will Deacon Link: https://lkml.kernel.org/r/20190410103644.485737321@linutronix.de --- kernel/locking/lockdep.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e16766ff184b..2edf9501d906 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -444,17 +444,6 @@ static int save_trace(struct stack_trace *trace) save_stack_trace(trace); - /* - * Some daft arches put -1 at the end to indicate its a full trace. - * - * this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; - trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; -- cgit v1.2.3 From accddc41b96915ab4e5d37796c6d17d70805999c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:28:08 +0200 Subject: latency_top: Remove the ULONG_MAX stack trace hackery No architecture terminates the stack trace with ULONG_MAX anymore. The consumer terminates on the first zero entry or at the number of entries, so no functional change. Remove the cruft. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Link: https://lkml.kernel.org/r/20190410103644.853527514@linutronix.de --- kernel/latencytop.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..f5a90ab3c6b9 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk, break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry marks end of backtrace: */ + if (!record) break; } if (same) { @@ -210,8 +210,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry is end of backtrace */ + if (!record) break; } if (same) { @@ -252,10 +252,10 @@ static int lstats_show(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; + seq_printf(m, " %ps", (void *)bt); } seq_puts(m, "\n"); -- cgit v1.2.3 From 4285f2fcef8001ead0f1c9315ba50302cab68cda Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:28:10 +0200 Subject: tracing: Remove the ULONG_MAX stack trace hackery No architecture terminates the stack trace with ULONG_MAX anymore. As the code checks the number of entries stored anyway there is no point in keeping all that ULONG_MAX magic around. The histogram code zeroes the storage before saving the stack, so if the trace is shorter than the maximum number of entries it can terminate the print loop if a zero entry is detected. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Link: https://lkml.kernel.org/r/20190410103645.048761764@linutronix.de --- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_stack.c | 20 +++++--------------- 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 795aa2038377..21ceae299f7e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5246,7 +5246,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned int i; for (i = 0; i < max_entries; i++) { - if (stacktrace_entries[i] == ULONG_MAX) + if (!stacktrace_entries[i]) return; seq_printf(m, "%*c", 1 + spaces, ' '); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..c6e54ff25cae 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,8 +18,7 @@ #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = - { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES + 1]; unsigned stack_trace_index[STACK_TRACE_ENTRIES]; /* @@ -52,10 +51,7 @@ void stack_trace_print(void) stack_trace_max.nr_entries); for (i = 0; i < stack_trace_max.nr_entries; i++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + if (i + 1 == stack_trace_max.nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -150,8 +146,6 @@ check_stack(unsigned long ip, unsigned long *stack) p = start; for (; p < top && i < stack_trace_max.nr_entries; p++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; /* * The READ_ONCE_NOCHECK is used to let KASAN know that * this is not a stack-out-of-bounds error. @@ -183,8 +177,6 @@ check_stack(unsigned long ip, unsigned long *stack) } stack_trace_max.nr_entries = x; - for (; x < i; x++) - stack_dump_trace[x] = ULONG_MAX; if (task_stack_end_corrupted(current)) { stack_trace_print(); @@ -286,7 +278,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n >= stack_trace_max.nr_entries) return NULL; m->private = (void *)n; @@ -360,12 +352,10 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= stack_trace_max.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (i >= stack_trace_max.nr_entries) return 0; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + if (i + 1 == stack_trace_max.nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; -- cgit v1.2.3 From 2d87a0674bd60d855e4008e2d84f5b23d7cb9b7d Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 10 Apr 2019 11:14:19 +0200 Subject: timekeeping: Audit clock adjustments Emit an audit record whenever the system clock is changed (i.e. shifted by a non-zero offset) by a syscall from userspace. 
The syscalls that can (at the time of writing) trigger such a record are: - settimeofday(2), stime(2), clock_settime(2) -- via do_settimeofday64() - adjtimex(2), clock_adjtime(2) -- via do_adjtimex() The new records have type AUDIT_TIME_INJOFFSET and contain the following fields: - sec -- the 'seconds' part of the offset - nsec -- the 'nanoseconds' part of the offset Example record (time was shifted backwards by ~15.875 seconds): type=TIME_INJOFFSET msg=audit(1530616049.652:13): sec=-16 nsec=124887145 The records of this type will be associated with the corresponding syscall records. Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Reviewed-by: Thomas Gleixner [PM: fixed a line width problem in __audit_tk_injoffset()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 7 +++++++ kernel/time/timekeeping.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 51a2ceb3a1ca..3843495d0083 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2512,6 +2512,13 @@ void __audit_fanotify(unsigned int response) AUDIT_FANOTIFY, "resp=%u", response); } +void __audit_tk_injoffset(struct timespec64 offset) +{ + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, + "sec=%lli nsec=%li", + (long long)offset.tv_sec, offset.tv_nsec); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..3d24be4cd607 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -1250,6 +1251,9 @@ out: /* signal hrtimers about time change */ clock_was_set(); + if (!ret) + audit_tk_injoffset(ts_delta); + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -2322,6 +2326,8 @@ int do_adjtimex(struct __kernel_timex *txc) ret = timekeeping_inject_offset(&delta); if (ret) return ret; + + audit_tk_injoffset(delta); } ktime_get_real_ts64(&ts); -- cgit v1.2.3 From 7e8eda734d30de81d06a949c9bf9853c445ede4e Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 10 Apr 2019 11:14:20 +0200 Subject: ntp: Audit NTP parameters adjustment Emit an audit record every time selected NTP parameters are modified from userspace (via adjtimex(2) or clock_adjtime(2)). These parameters may be used to indirectly change the system clock, and thus their modifications should be audited. Such events will now generate records of type AUDIT_TIME_ADJNTPVAL containing the following fields: - op -- which value was adjusted: - offset -- corresponding to the time_offset variable - freq -- corresponding to the time_freq variable - status -- corresponding to the time_status variable - adjust -- corresponding to the time_adjust variable - tick -- corresponding to the tick_usec variable - tai -- corresponding to the timekeeping's TAI offset - old -- the old value - new -- the new value Example records: type=TIME_ADJNTPVAL msg=audit(1530616044.507:7): op=status old=64 new=8256 type=TIME_ADJNTPVAL msg=audit(1530616044.511:11): op=freq old=0 new=49180377088000 The records of this type will be associated with the corresponding syscall records.
An overview of parameter changes that can be done via do_adjtimex() (based on information from Miroslav Lichvar) and whether they are audited: __timekeeping_set_tai_offset() -- sets the offset from the International Atomic Time (AUDITED) NTP variables: time_offset -- can adjust the clock by up to 0.5 seconds per call and also speed it up or slow down by up to about 0.05% (43 seconds per day) (AUDITED) time_freq -- can speed up or slow down by up to about 0.05% (AUDITED) time_status -- can insert/delete leap seconds and it also enables/ disables synchronization of the hardware real-time clock (AUDITED) time_maxerror, time_esterror -- change error estimates used to inform userspace applications (NOT AUDITED) time_constant -- controls the speed of the clock adjustments that are made when time_offset is set (NOT AUDITED) time_adjust -- can temporarily speed up or slow down the clock by up to 0.05% (AUDITED) tick_usec -- a more extreme version of time_freq; can speed up or slow down the clock by up to 10% (AUDITED) Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Reviewed-by: Thomas Gleixner Signed-off-by: Paul Moore --- kernel/auditsc.c | 22 ++++++++++++++++++++++ kernel/time/ntp.c | 22 +++++++++++++++++++--- kernel/time/ntp_internal.h | 4 +++- kernel/time/timekeeping.c | 7 ++++++- 4 files changed, 50 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3843495d0083..5371b59bde36 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2519,6 +2519,28 @@ void __audit_tk_injoffset(struct timespec64 offset) (long long)offset.tv_sec, offset.tv_nsec); } +static void audit_log_ntp_val(const struct audit_ntp_data *ad, + const char *op, enum audit_ntp_type type) +{ + const struct audit_ntp_val *val = &ad->vals[type]; + + if (val->newval == val->oldval) + return; + + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, + "op=%s old=%lli new=%lli", op, val->oldval, val->newval); +} + +void __audit_ntp_log(const struct audit_ntp_data *ad) +{ + audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); + audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); + audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); + audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); + audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); + audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..ac5555e25733 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -709,7 +710,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, * kernel time-keeping variables. used by xntpd. 
*/ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai) + s32 *time_tai, struct audit_ntp_data *ad) { int result; @@ -720,14 +721,29 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; ntp_update_frequency(); + + audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); } txc->offset = save_adjust; } else { - /* If there are input parameters, then process them: */ - if (txc->modes) + if (txc->modes) { + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + process_adjtimex_modes(txc, time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 40e6122e634e..908ecaa65fc3 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,8 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, + const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d24be4cd607..f366f2fdf1b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2307,6 +2307,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; + struct audit_ntp_data ad; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; @@ -2330,13 +2331,15 @@ int do_adjtimex(struct __kernel_timex *txc) audit_tk_injoffset(delta); } + audit_ntp_init(&ad); + ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai); + ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); @@ -2347,6 +2350,8 @@ int do_adjtimex(struct __kernel_timex *txc) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + audit_ntp_log(&ad); + /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) timekeeping_advance(TK_ADV_FREQ); -- cgit v1.2.3 From 02a8c817a31606b6b37c2b755f6569903f44241e Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Sun, 14 Apr 2019 18:58:46 +0200 Subject: bpf: add map helper functions push, pop, peek in more BPF programs commit f1a2e44a3aec ("bpf: add queue and stack maps") introduced new BPF helper functions: - BPF_FUNC_map_push_elem - BPF_FUNC_map_pop_elem - BPF_FUNC_map_peek_elem but they 
were made available only for network BPF programs. This patch makes them available for tracepoint, cgroup and lirc programs. Signed-off-by: Alban Crequy Cc: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 6 ++++++ kernel/trace/bpf_trace.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e58a6c247f56..fcde0f7b2585 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -713,6 +713,12 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..91800be0c8eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -569,6 +569,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: -- cgit v1.2.3 From c68d224e5ed15605e651e2482c6ffd95915ddf58 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Apr 2019 10:32:51 -0700 Subject: perf/core: Add perf_pmu_resched() as a global function This patch adds perf_pmu_resched(), a global function that can be called to force rescheduling of events for a given PMU. The function locks both cpuctx and task_ctx internally. This will be used by a subsequent patch. Signed-off-by: Stephane Eranian [ Simplified the calling convention. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Cc: nelson.dsouza@intel.com Cc: tonyj@suse.com Link: https://lkml.kernel.org/r/20190408173252.37932-2-eranian@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 30a572e4c6f1..abbd4b3b96c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2478,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_pmu_enable(cpuctx->ctx.pmu); } +void perf_pmu_resched(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + perf_ctx_unlock(cpuctx, task_ctx); +} + /* * Cross CPU call to install and enable a performance event * -- cgit v1.2.3 From e2abb398115e9c33f3d1e25bf6d1d08badc58b13 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2019 15:06:21 +0100 Subject: sched/fair: Remove unneeded prototype of capacity_of() The prototype of that function was already hoisted up in: commit 3b1baa6496e6 ("sched/fair: Add 'group_misfit_task' load-balance type") but that seems to have been missed. Get rid of the extra prototype. Signed-off-by: Valentin Schneider Acked-by: Quentin Perret Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator") Link: http://lkml.kernel.org/r/20190416140621.19884-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed7f5f8107b7..b6cc0703b850 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5116,7 +5116,6 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu); static inline bool cpu_overutilized(int cpu) { -- cgit v1.2.3 From 6d25be5782e482eb93e3de0c94d0a517879377d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:48 +0100 Subject: sched/core, workqueues: Disentangle worker accounting from rq lock The worker accounting for CPU-bound workers is plugged into the core scheduler code and the wakeup code. This is not a hard requirement and can be avoided by keeping track of the state in the workqueue code itself. Keep track of the sleeping state in the worker itself and call the notifier before entering the core scheduler. There might be false positives when the task is woken between that call and actually scheduling, but that's not really different from scheduling and being woken immediately after switching away. nr_running is now updated when the task returns from schedule(), and it is later compared when the update is done from ttwu().
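Condensed, the change moves the hooks out of the rq-lock-protected wakeup and schedule paths and into the schedule() entry and exit paths. A sketch of the resulting shape, using the names from the diff below:

	/* on the way into schedule() */
	static inline void sched_submit_work(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER) {
			preempt_disable();
			wq_worker_sleeping(tsk);	/* sets worker->sleeping, may wake an idle worker */
			preempt_enable_no_resched();
		}
		/* plugged-IO flush omitted in this sketch */
	}

	/* on the way out of schedule() */
	static void sched_update_worker(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);		/* clears worker->sleeping, fixes up nr_running */
	}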
[ bigeasy: preempt_disable() around wq_worker_sleeping() by Daniel Bristot de Oliveira ] Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Daniel Bristot de Oliveira Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/ad2b29b5715f970bffc1a7026cabd6ff0b24076a.1532952814.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 88 +++++++++++---------------------------------- kernel/workqueue.c | 54 +++++++++++++--------------- kernel/workqueue_internal.h | 5 +-- 3 files changed, 48 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4778c48a7fda..6184a0856aab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1685,10 +1685,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; - - /* If a worker is waking up, notify the workqueue: */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -2106,56 +2102,6 @@ out: return success; } -/** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet picked a replacement task. - */ - rq_unlock(rq, rf); - raw_spin_lock(&p->pi_lock); - rq_relock(rq, rf); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - trace_sched_waking(p); - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&rq->nr_iowait); - } - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); - } - - ttwu_do_wakeup(rq, p, 0, rf); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. @@ -3472,19 +3418,6 @@ static void __sched notrace __schedule(bool preempt) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) - try_to_wake_up_local(to_wakeup, &rf); - } } switch_count = &prev->nvcsw; } @@ -3544,6 +3477,20 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again + * in the possible wakeup of a kworker. 
+ */ + if (tsk->flags & PF_WQ_WORKER) { + preempt_disable(); + wq_worker_sleeping(tsk); + preempt_enable_no_resched(); + } + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -3552,6 +3499,12 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -3562,6 +3515,7 @@ asmlinkage __visible void __sched schedule(void) __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddee541ea97a..56180c9286f5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_waking_up - a worker is waking up + * wq_worker_running - a worker is running again * @task: task waking up - * @cpu: CPU @task is waking up to * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->cpu != cpu); + if (!worker->sleeping) + return; + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); - } + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct worker *next, *worker = kthread_data(task); struct worker_pool *pool; /* @@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; pool = worker->pool; - /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) - return NULL; + if (WARN_ON_ONCE(worker->sleeping)) + return; + + worker->sleeping = 1; + spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - to_wakeup = first_idle_worker(pool); - return to_wakeup ? 
to_wakeup->task : NULL; + !list_empty(&pool->worklist)) { + next = first_idle_worker(pool); + if (next) + wake_up_process(next->task); + } + spin_unlock_irq(&pool->lock); } /** @@ -4929,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool) * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in - * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -44,6 +44,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/ and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 1b174a2cb67a3a156d5a28426ae14241e6dfa655 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Apr 2019 09:53:13 +0200 Subject: sched/core: Remove ttwu_activate() After the removal of try_to_wake_up_local(), there is only one user of ttwu_activate() left, and since it is a trivial function, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6184a0856aab..3feb83df322e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1681,12 +1681,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) __schedstat_inc(p->se.statistics.nr_wakeups_sync); } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; -} - /* * Mark the task runnable and perform wakeup-preemption. */ @@ -1738,7 +1732,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, en_flags); + activate_task(rq, p, en_flags); + p->on_rq = TASK_ON_RQ_QUEUED; ttwu_do_wakeup(rq, p, wake_flags, rf); } -- cgit v1.2.3 From 7dd7788411646c9619aa6495f832bc0a9b0146b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Apr 2019 09:59:05 +0200 Subject: sched/core: Unify p->on_rq updates Almost all {,de}activate_task() invocations pair with p->on_rq updates, the exception being the usage in rt/deadline which hold both rq locks and therefore don't strictly need to set TASK_ON_RQ_MIGRATING, but it is harmless if we do anyway. Put the updates in {,de}activate_task() and cut down on repetition. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++----- kernel/sched/fair.c | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3feb83df322e..f4838b78b9f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); + + p->on_rq = TASK_ON_RQ_QUEUED; } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { + p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -1237,11 +1241,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -1733,7 +1735,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; ttwu_do_wakeup(rq, p, wake_flags, rf); } @@ -2408,7 +2409,6 @@ void wake_up_new_task(struct task_struct *p) post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3407,7 +3407,6 @@ static void __sched notrace __schedule(bool preempt) prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - prev->on_rq = 0; if (prev->in_iowait) { atomic_inc(&rq->nr_iowait); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b6cc0703b850..e5b100b6ba4e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7491,7 +7491,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -7627,7 +7626,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } -- cgit v1.2.3 From 2d65c42b43e53d61f1fd6b8d0a097451a4cffa24 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 10 Apr 2019 12:09:14 -0500 Subject: genirq/devres: Use struct_size() in devm_kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; size = sizeof(struct foo) + count * sizeof(struct boo); instance = devm_kzalloc(dev, size, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper. instance = devm_kzalloc(dev, struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190410170914.GA16161@embeddedor --- kernel/irq/devres.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index f808c6a97dcc..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, irq_flow_handler_t handler) { struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - gc = devm_kzalloc(dev, sz, GFP_KERNEL); + gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL); if (gc) irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, handler); -- cgit v1.2.3 From 0d306c31b2f77391dacdeaad4470c577f2aecc4f Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 16 Apr 2019 18:13:01 +0900 Subject: bpf: use BPF_CAST_CALL for casting bpf call verifier.c uses BPF_CAST_CALL for casting bpf call except at one place in jit_subprogs(). Let's use the macro for consistency. Signed-off-by: Prashant Bhole Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7220153c5b1..db301e9b5295 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7647,9 +7647,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; + insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - + __bpf_call_base; } /* we use the aux data to keep a list of the start addresses -- cgit v1.2.3 From 98af8452945c55652de68536afdde3b520fec429 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 12 Apr 2019 15:39:28 -0500 Subject: cpu/speculation: Add 'mitigations=' cmdline option Keeping track of the number of mitigations for all the CPU speculation bugs has become overwhelming for many users. It's getting more and more complicated to decide which mitigations are needed for a given architecture. Complicating matters is the fact that each arch tends to have its own custom way to mitigate the same vulnerability. Most users fall into a few basic categories: a) they want all mitigations off; b) they want all reasonable mitigations on, with SMT enabled even if it's vulnerable; or c) they want all reasonable mitigations on, with SMT disabled if vulnerable. Define a set of curated, arch-independent options, each of which is an aggregation of existing options: - mitigations=off: Disable all mitigations. - mitigations=auto: [default] Enable all the default mitigations, but leave SMT enabled, even if it's vulnerable. - mitigations=auto,nosmt: Enable all the default mitigations, disabling SMT if needed by a mitigation. Currently, these options are placeholders which don't actually do anything. They will be fleshed out in upcoming patches. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina (on x86) Reviewed-by: Jiri Kosina Cc: Borislav Petkov Cc: "H . 
Peter Anvin" Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Waiman Long Cc: Andrea Arcangeli Cc: Jon Masters Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-s390@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: Greg Kroah-Hartman Cc: Tyler Hicks Cc: Linus Torvalds Cc: Randy Dunlap Cc: Steven Price Cc: Phil Auld Link: https://lkml.kernel.org/r/b07a8ef9b7c5055c3a4637c87d07c296d5016fe0.1555085500.git.jpoimboe@redhat.com --- kernel/cpu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d1c6d152da89..e70a90634b41 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2279,3 +2279,18 @@ void __init boot_cpu_hotplug_init(void) #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } + +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + cpu_mitigations = CPU_MITIGATIONS_OFF; + else if (!strcmp(arg, "auto")) + cpu_mitigations = CPU_MITIGATIONS_AUTO; + else if (!strcmp(arg, "auto,nosmt")) + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + + return 0; +} +early_param("mitigations", mitigations_parse_cmdline); -- cgit v1.2.3 From 77361825bb01ecadf3ac8622e2e4dbc28806e858 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:32 +0200 Subject: bpf: cpumap use ptr_ring_consume_batched Move ptr_ring dequeue outside loop, that allocate SKBs and calls network stack, as these operations that can take some time. The ptr_ring is a communication channel between CPUs, where we want to reduce/limit any cacheline bouncing. Do a concentrated bulk dequeue via ptr_ring_consume_batched, to shorten the period and times the remote cacheline in ptr_ring is read Batch size 8 is both to (1) limit BH-disable period, and (2) consume one cacheline on 64-bit archs. After reducing the BH-disable section further then we can consider changing this, while still thinking about L1 cacheline size being active. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3c18260403dd..430103e182a0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +#define CPUMAP_BATCH 8 + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -252,8 +254,9 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_frame *xdpf; + unsigned int drops = 0, sched = 0; + void *frames[CPUMAP_BATCH]; + int i, n; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -269,14 +272,16 @@ static int cpu_map_kthread_run(void *data) sched = cond_resched(); } - /* Process packets in rcpu->queue */ - local_bh_disable(); /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. 
*/ - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { + n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + local_bh_disable(); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb; int ret; @@ -290,13 +295,9 @@ static int cpu_map_kthread_run(void *data) ret = netif_receive_skb_core(skb); if (ret == NET_RX_DROP) drops++; - - /* Limit BH-disable period */ - if (++processed == 8) - break; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); local_bh_enable(); /* resched point, may call do_softirq() */ } -- cgit v1.2.3 From 8f0504a97e1ba6b70e1c8b5a88255c280f263287 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:43 +0200 Subject: bpf: cpumap do bulk allocation of SKBs As cpumap now batch-consumes xdp_frames from the ptr_ring, it knows how many SKBs it needs to allocate. Thus, let's bulk-allocate these SKBs via the kmem_cache_alloc_bulk() API, and use the previously introduced function build_skb_around(). Notice that the flag __GFP_ZERO asks the slab/slub allocator to clear the memory for us. This does clear a larger area than needed, but my micro benchmarks on Intel CPUs show that this is slightly faster, because a cacheline-aligned area is cleared for the SKBs. (For the SLUB allocator, there is a future optimization potential, because SKBs will with high probability originate from the same page. If we can find/identify contiguous memory areas, then the Intel CPU memset rep stos will have a real performance gain.) Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 430103e182a0..732d6ced3987 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -160,12 +160,12 @@ static void cpu_map_kthread_stop(struct work_struct *work) } static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf) + struct xdp_frame *xdpf, + struct sk_buff *skb) { unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; - struct sk_buff *skb; /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; @@ -191,8 +191,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb(pkt_data_start, frame_size); - if (!skb) + skb = build_skb_around(skb, pkt_data_start, frame_size); + if (unlikely(!skb)) return NULL; skb_reserve(skb, hard_start_headroom); @@ -256,7 +256,9 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; - int i, n; + void *skbs[CPUMAP_BATCH]; + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -278,14 +280,20 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. 
*/ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < n; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops = n; + } local_bh_disable(); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb; + struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf); + skb = cpu_map_build_skb(rcpu, xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; -- cgit v1.2.3 From 86d231459d6dc9094e70c35c3517f4ef860b2f1e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:48 +0200 Subject: bpf: cpumap memory prefetchw optimizations for struct page A lot of the performance gain comes from this patch. While analysing the performance overhead, it was found that the largest CPU stalls were caused when touching the struct page area. It is first read with a READ_ONCE from build_skb_around via page_is_pfmemalloc(), and, when freed, written by the page_frag_free() call. Measurements show that the prefetchw (W) variant of the prefetch operation is needed to achieve the performance gain. We believe this optimization is two-fold: first, the W-variant saves one step in the cache-coherency protocol; second, it helps us avoid the non-temporal prefetch HW optimizations and brings this into all cache levels. It might be worth investigating whether a prefetch into L2 would have the same benefit. Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 732d6ced3987..cf727d77c6c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -280,6 +280,18 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. */ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + for (i = 0; i < n; i++) { + void *f = frames[i]; + struct page *page = virt_to_page(f); + + /* Bring struct page memory area to curr CPU. Read by + * build_skb_around via page_is_pfmemalloc(), and when + * freed written by page_frag_free call. + */ + prefetchw(page); + } + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); if (unlikely(m == 0)) { for (i = 0; i < n; i++) -- cgit v1.2.3 From 0d2cc3b3453254f1c56f9456ba03e092ed4cfb72 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:41 +0200 Subject: locking/lockdep: Move valid_state() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING valid_state() and print_usage_bug*() functions are not used beyond irq locking correctness checks under CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING. Sadly the "unused function" warning wouldn't fire because valid_state() is inline, so the unused case has remained unseen until now. So move them inside the appropriate CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING section. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..9c5819ef4a28 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2784,6 +2784,12 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + + static void print_usage_bug_scenario(struct held_lock *lock) { @@ -2853,10 +2859,6 @@ valid_state(struct task_struct *curr, struct held_lock *this, return 1; } -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: -- cgit v1.2.3 From c902a1e8d9c9b47cd8faa16892710247cdda9b02 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:42 +0200 Subject: locking/lockdep: Map remaining magic numbers to lock usage mask names Clarify the code by mapping some more constant numbers that haven't yet been named after their corresponding LOCK_USAGE_* symbols. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9c5819ef4a28..2288aa2fa4c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -516,11 +516,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { char c = '.'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '+'; if (class->usage_mask & lock_flag(bit)) { c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '?'; } @@ -1971,7 +1971,10 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; + if (bit & LOCK_USAGE_READ_MASK) + return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; + else + return state_names[bit >> LOCK_USAGE_DIR_MASK]; } static int exclusive_bit(int new_bit) @@ -3017,7 +3020,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { - return state_verbose_f[bit >> 2](class); + return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, -- cgit v1.2.3 From 627f364d24c009b61c9199b2c75006e35c294675 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:43 +0200 Subject: locking/lockdep: Use expanded masks on find_usage_*() functions In order to optimize check_irq_usage() and factorize all the IRQ usage validations, we'll need to be able to check multiple lock usage bits at once; a sketch of the widened interface follows. 
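As a hedged illustration, not part of the patch itself: lock_flag(), hlock_class(), find_usage_forwards() and the LOCK_USED_IN_* bits are lockdep's own names, while the OR'd combination below is made up for the sketch, and "this" stands for the held_lock under scrutiny as elsewhere in lockdep.

	/* Sketch only: with a mask rather than a single bit, one BFS
	 * walk can match any of several usage bits in a single pass. */
	struct lock_list root, *target_entry;
	unsigned long usage_mask = lock_flag(LOCK_USED_IN_HARDIRQ) |
				   lock_flag(LOCK_USED_IN_SOFTIRQ);
	int ret;

	root.parent = NULL;
	root.class = hlock_class(this);
	ret = find_usage_forwards(&root, usage_mask, &target_entry);
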
Prepare the low level usage mask check functions for that purpose. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2288aa2fa4c6..5e149dd78298 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1682,9 +1682,9 @@ check_redundant(struct lock_list *root, struct lock_class *target, * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); + return entry->class->usage_mask & *(unsigned long *)mask; } @@ -1700,14 +1700,14 @@ static inline int usage_match(struct lock_list *entry, void *bit) * Return <0 on error. */ static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1723,14 +1723,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, * Return <0 on error. */ static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1935,7 +1935,7 @@ check_usage(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); + ret = find_usage_backwards(&this, lock_flag(bit_backwards), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -1943,7 +1943,7 @@ check_usage(struct task_struct *curr, struct held_lock *prev, that.parent = NULL; that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); + ret = find_usage_forwards(&that, lock_flag(bit_forwards), &target_entry1); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2941,7 +2941,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); + ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2965,7 +2965,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); + ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) -- cgit v1.2.3 From 8808a7c65423cdd07ea12f9ecd812e56c7857421 Mon Sep 17 00:00:00 2001 From: Peter 
Zijlstra Date: Tue, 9 Apr 2019 13:59:03 +0200 Subject: locking/lockdep: Generate LOCKF_ bit composites Instead of open-coding the bitmasks, generate them using the lockdep_states.h header. This prepares for additional states, which would make the manual masks tedious and error prone. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d4c197425f68..2b3ffd4117ad 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -42,13 +42,29 @@ enum { __LOCKF(USED) }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE -#define LOCKF_ENABLED_IRQ_READ \ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE /* * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, -- cgit v1.2.3 From 76e1552466ff2da8b909df0fff3600ec1c27edcc Mon Sep 17 00:00:00 2001 From: Arash Fotouhi Date: Fri, 22 Mar 2019 19:28:32 -0700 Subject: watchdog: Fix typo in comment Signed-off-by: Arash Fotouhi Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: loberman@redhat.com Cc: vincent.whitchurch@axis.com Link: http://lkml.kernel.org/r/1553308112-3513-1-git-send-email-arash@arashfotouhi.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6a5787233113..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -590,7 +590,7 @@ static void lockup_detector_reconfigure(void) * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When - * the threads are sucessfully initialized, take the proper locks and + * the threads are successfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. */ static __init void lockup_detector_setup(void) -- cgit v1.2.3 From 471ba0e686cb13752bc1ff3216c54b69a2d250ea Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Apr 2019 19:34:03 +1000 Subject: irq_work: Do not raise an IPI when queueing work on the local CPU The QEMU PowerPC/PSeries machine model was not expecting a self-IPI, and it may be a surprising thing to do in general, so have irq_work_queue_on() do local queueing when the target is the current CPU. 
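A hedged usage sketch of the API touched here: the callback, its message and kick_cpu() are invented for illustration, while DEFINE_IRQ_WORK() and irq_work_queue_on() are the kernel's own interfaces.

	#include <linux/irq_work.h>
	#include <linux/printk.h>
	#include <linux/smp.h>

	static void my_work_func(struct irq_work *work)
	{
		/* Runs in hard-IRQ context once the queued work fires. */
		pr_info("irq_work ran on CPU %d\n", smp_processor_id());
	}

	static DEFINE_IRQ_WORK(my_work, my_work_func);

	static void kick_cpu(int cpu)
	{
		/* With this patch, cpu == smp_processor_id() queues
		 * locally instead of raising a self-IPI. */
		irq_work_queue_on(&my_work, cpu);
	}
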
Suggested-by: Steven Rostedt Reported-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Suraj Jitindar Singh Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190409093403.20994-1-npiggin@gmail.com [ Simplified the preprocessor comments. Fixed unbalanced curly brackets pointed out by Thomas. ] Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 75 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..73288914ed5e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void) */ } -/* - * Enqueue the irq_work @work on @cpu unless it's already pending - * somewhere. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue_on(struct irq_work *work, int cpu) +/* Enqueue on current CPU, work must already be claimed and preempt disabled */ +static void __irq_work_queue_local(struct irq_work *work) { - /* All work should have been flushed before going offline */ - WARN_ON_ONCE(cpu_is_offline(cpu)); - -#ifdef CONFIG_SMP - - /* Arch remote IPI send/receive backend aren't NMI safe */ - WARN_ON_ONCE(in_nmi()); + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + arch_irq_work_raise(); + } +} +/* Enqueue the irq work @work on the current CPU */ +bool irq_work_queue(struct irq_work *work) +{ /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); - -#else /* #ifdef CONFIG_SMP */ - irq_work_queue(work); -#endif /* #else #ifdef CONFIG_SMP */ + /* Queue the entry and raise the IPI if needed. */ + preempt_disable(); + __irq_work_queue_local(work); + preempt_enable(); return true; } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* Enqueue the irq work @work on the current CPU */ -bool irq_work_queue(struct irq_work *work) +/* + * Enqueue the irq_work @work on @cpu unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue_on(struct irq_work *work, int cpu) { +#ifndef CONFIG_SMP + return irq_work_queue(work); + +#else /* CONFIG_SMP: */ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - /* Queue the entry and raise the IPI if needed. 
*/ preempt_disable(); - - /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); + __irq_work_queue_local(work); } - preempt_enable(); return true; +#endif /* CONFIG_SMP */ } -EXPORT_SYMBOL_GPL(irq_work_queue); + bool irq_work_needs_cpu(void) { -- cgit v1.2.3 From b1546edcf2aab710a5afc98d65c948a4bfac0353 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 18 Apr 2019 22:47:13 +0800 Subject: sched/core: Make some functions static Fix these sparse warnings: kernel/sched/core.c:6577:11: warning: symbol 'min_cfs_quota_period' was not declared. Should it be static? kernel/sched/core.c:6657:5: warning: symbol 'tg_set_cfs_quota' was not declared. Should it be static? kernel/sched/core.c:6670:6: warning: symbol 'tg_get_cfs_quota' was not declared. Should it be static? kernel/sched/core.c:6683:5: warning: symbol 'tg_set_cfs_period' was not declared. Should it be static? kernel/sched/core.c:6693:6: warning: symbol 'tg_get_cfs_period' was not declared. Should it be static? kernel/sched/fair.c:2596:6: warning: symbol 'task_tick_numa' was not declared. Should it be static? Signed-off-by: YueHaibing Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190418144713.34332-1-yuehaibing@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/fair.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f4838b78b9f9..8b64ef0d5589 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6522,7 +6522,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6602,7 +6602,7 @@ out_unlock: return ret; } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; @@ -6615,7 +6615,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; @@ -6628,7 +6628,7 @@ long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; @@ -6638,7 +6638,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e5b100b6ba4e..13bafe350abf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2593,7 +2593,7 @@ 
out: /* * Drive the periodic memory faults.. */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr) { struct callback_head *work = &curr->numa_work; u64 period, now; -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add a ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit, so I use a large bitmap. The input and output are in the form of a list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use the ratemask instead of a hard-coded limit. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443, but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove the unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocating it. A pointer to it is still needed because of the way proc_do_large_bitmap() works. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9ec050bcf46..599510a3355e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3326,6 +3326,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ @@ -3366,3 +3371,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); -- cgit v1.2.3 From bee9853932e90ce94bce4276ec6b7b06bc48070b Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Wed, 6 Mar 2019 20:13:33 -0500 Subject: sched/core: Fix typo in comment Signed-off-by: Joel Savitz Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1551921213-813-1-git-send-email-jsavitz@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b64ef0d5589..fb09eaad1d3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -924,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
*/ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -- cgit v1.2.3 From bff9504bfc9c5c6610b42d47f689f350fd969eb8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Mar 2019 14:47:53 -0500 Subject: rseq: Clean up comments by reflecting removal of event counter The "event counter" was removed from rseq before it was merged upstream. However, a few comments in the source code still refer to it. Adapt the comments to match reality. Signed-off-by: Mathieu Desnoyers Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: H. Peter Anvin Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E. McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20190305194755.2602-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/rseq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..849afe749131 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs) * - signal delivery, * and return to user-space. * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. -- cgit v1.2.3 From 83b0b15bcb0f700e7c1d070aae2e7841170a4c33 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Mar 2019 14:47:54 -0500 Subject: rseq: Remove superfluous rseq_len from task_struct The rseq system call, when invoked with flags of "0" or "RSEQ_FLAG_UNREGISTER" values, expects the rseq_len parameter to be equal to sizeof(struct rseq), which is fixed-size and fixed-layout, specified in uapi linux/rseq.h. Expecting a fixed size for rseq_len is a design choice that ensures multiple libraries and application defining __rseq_abi in the same process agree on its exact size. Considering that this size is and will always be the same value, there is no point in saving this value within task_struct rseq_len. Remove this field from task_struct. No change in functionality intended. Signed-off-by: Mathieu Desnoyers Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: H. Peter Anvin Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E. McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20190305194755.2602-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/rseq.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 849afe749131..9424ee90589e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -313,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. 
*/ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (current->rseq_len != rseq_len) + if (rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -321,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (ret) return ret; current->rseq = NULL; - current->rseq_len = 0; current->rseq_sig = 0; return 0; } @@ -335,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || current->rseq_len != rseq_len) + if (current->rseq != rseq || rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -353,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; - current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been -- cgit v1.2.3 From f6c6010a07734103a31faa0cc977641b358c45b0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 5 Mar 2019 16:34:32 +0800 Subject: mm/resource: Use resource_overlaps() to simplify region_intersects() The three checks in region_intersects() are basically an open-coded version of resource_overlaps() - so use the real thing. Also fix typos in comments while at it. Signed-off-by: Wei Yang Reviewed-by: Like Xu Reviewed-by: Yuan Yao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: bhelgaas@google.com Cc: bp@suse.de Cc: dan.j.williams@intel.com Cc: jack@suse.cz Cc: rdunlap@infradead.org Cc: tiwai@suse.de Link: http://lkml.kernel.org/r/20190305083432.23675-1-richardw.yang@linux.intel.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- kernel/iomem.c | 4 ++-- kernel/resource.c | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..93c264444510 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size, * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return * a pointer into the direct map. * @@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { /* - * MEMREMAP_WB is special in that it can be satisifed + * MEMREMAP_WB is special in that it can be satisfied * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where * the requested range is potentially in System RAM. 
diff --git a/kernel/resource.c b/kernel/resource.c index 92190f62ebc5..8c15f846e8ef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - resource_size_t end = start + size - 1; + struct resource res; int type = 0; int other = 0; struct resource *p; + res.start = start; + res.end = start + size - 1; + read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); - if (start >= p->start && start <= p->end) - is_type ? type++ : other++; - if (end >= p->start && end <= p->end) - is_type ? type++ : other++; - if (p->start >= start && p->end <= end) + if (resource_overlaps(p, &res)) is_type ? type++ : other++; } read_unlock(&resource_lock); -- cgit v1.2.3 From 1a010e29cfa00fee2888fd2fd4983f848cbafb58 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 Feb 2019 11:10:17 +0300 Subject: sched/rt: Check integer overflow at usec to nsec conversion Example of unhandled overflows: # echo 18446744073709651 > cpu.rt_runtime_us # cat cpu.rt_runtime_us 99 # echo 18446744073709900 > cpu.rt_period_us # cat cpu.rt_period_us 348 After this patch they will fail with -EINVAL. Signed-off-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/155125501739.293431.5252197504404771496.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 90fa23d36565..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; -- cgit v1.2.3 From 5b61d50ab4ef590f5e1d4df15cd2cea5f5715308 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 Feb 2019 11:10:18 +0300 Subject: sched/core: Handle overflow in cpu_shares_write_u64 Bit shift in scale_load() could overflow shares. This patch saturates it to MAX_SHARES like following sched_group_set_shares(). 
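To see where the numbers in the example below come from, a hedged walk-through (assuming the 64-bit scale_load() shift of 10 and MAX_SHARES == 1UL << 18, per kernel/sched/sched.h): 9223372036854776832 is 2^63 + 2^10, so the scale_load() shift pushes the 2^63 part out of the 64-bit word and leaves 2^20; reading the file back applies scale_load_down(), giving 2^10 = 1024. With the new check, any shareval above scale_load_down(ULONG_MAX) saturates to MAX_SHARES, which reads back as 2^18 = 262144.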
Example: # echo 9223372036854776832 > cpu.shares # cat cpu.shares Before patch: 1024 After patch: 262144 Signed-off-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/155125501891.293431.3345233332801109696.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb09eaad1d3a..685b1541ce51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6507,6 +6507,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -- cgit v1.2.3 From 1a8b4540db732ca16c9e43ac7c08b1b8f0b252d8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 Feb 2019 11:10:20 +0300 Subject: sched/core: Check quota and period overflow at usec to nsec conversion Large values could overflow u64 and pass the following sanity checks. # echo 18446744073750000 > cpu.cfs_period_us # cat cpu.cfs_period_us 40448 # echo 18446744073750000 > cpu.cfs_quota_us # cat cpu.cfs_quota_us 40448 After this patch they will fail with -EINVAL. Signed-off-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/155125502079.293431.3947497929372138600.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 685b1541ce51..de8ab411826c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6611,8 +6611,10 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; - else + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } @@ -6634,6 +6636,9 @@ static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; + if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; -- cgit v1.2.3 From 13e792a19d4e3a1c64e94197ba357685fd584ded Mon Sep 17 00:00:00 2001 From: Laurent Gauthier Date: Sat, 5 Jan 2019 00:07:45 +0100 Subject: tick: Fix typos in comments Signed-off-by: Laurent Gauthier Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7541cbca695e..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -809,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * either the CPU handling the broadcast * interrupt or we got woken by something else. * - * We are not longer in the broadcast mask, so + * We are no longer in the broadcast mask, so * if the cpu local expiry time is already * reached, we would reprogram the cpu local * timer with an already expired event. 
* * This can lead to a ping-pong when we return - * to idle and therefor rearm the broadcast + * to idle and therefore rearm the broadcast * timer before the cpu local timer was able * to fire. This happens because the forced * reprogramming makes sure that the event -- cgit v1.2.3 From b6fbbf31d15b5072250ec6ed79e415a1160e5621 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Dec 2018 14:34:44 +0100 Subject: cgroup/cpuset: Update stale generate_sched_domains() comments Commit: fc560a26acce ("cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()") removed the local list (q) that was used to perform a top-down scan of all cpusets; however, comments mentioning it were not updated. Update the comments to reflect the current implementation. Signed-off-by: Juri Lelli Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: lizefan@huawei.com Link: http://lkml.kernel.org/r/20181219133445.31982-1-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4834c4214e9c..6a1942ed781c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -740,11 +740,10 @@ static inline int nr_cpusets(void) * Must be called with cpuset_mutex held. * * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. + * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -775,7 +774,7 @@ static inline int nr_cpusets(void) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ -- cgit v1.2.3 From cb0c04143b6196f4a479ba113706329fc667ee15 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Dec 2018 14:34:45 +0100 Subject: sched/topology: Update init_sched_domains() comment Holding the hotplug lock is not a requirement anymore for callers of sched_init_domains() after commit: 6acce3ef8452 ("sched: Remove get_online_cpus() usage") Update the relevant comment preceding init_sched_domains(). 
Signed-off-by: Juri Lelli Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: lizefan@huawei.com Link: http://lkml.kernel.org/r/20181219133445.31982-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c65b31e9458b..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2081,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) } /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups. For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future. */ int sched_init_domains(const struct cpumask *cpu_map) { -- cgit v1.2.3 From 50943f3e136adfc421f9768d6ae09ba7b83aaefd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:01 -0700 Subject: cgroup: rename freezer.c into legacy_freezer.c Freezer.c will contain an implementation of cgroup v2 freezer, so let's rename the v1 freezer to avoid naming conflicts. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/freezer.c | 481 ----------------------------------------- kernel/cgroup/legacy_freezer.c | 481 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 482 insertions(+), 482 deletions(-) delete mode 100644 kernel/cgroup/freezer.c create mode 100644 kernel/cgroup/legacy_freezer.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..8d5689ca94b9 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c deleted file mode 100644 index 08236798d173..000000000000 --- a/kernel/cgroup/freezer.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. 
- */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ - - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; - -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; - -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct freezer, css) : NULL; -} - -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); -} - -static struct freezer *parent_freezer(struct freezer *freezer) -{ - return css_freezer(freezer->css.parent); -} - -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; - - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); - - return ret; -} - -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; - - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); - - return &freezer->css; -} - -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. - */ -static int freezer_css_online(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; -} - -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. - */ -static void freezer_css_offline(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); -} - -static void freezer_css_free(struct cgroup_subsys_state *css) -{ - kfree(css_freezer(css)); -} - -/* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. - * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. - */ -static void freezer_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. 
- * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } - } - - mutex_unlock(&freezer_mutex); -} - -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. - */ -static void freezer_fork(struct task_struct *task) -{ - struct freezer *freezer; - - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) - return; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. - */ -static void update_if_frozen(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; - struct css_task_iter it; - struct task_struct *task; - - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; - - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); - - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); - - /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); - - while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. 
- */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); - } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); - css_task_iter_end(&it); -} - -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); -} - -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. - */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) -{ - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) - return; - - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } - } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; - - /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. 
- */ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - bool freeze; - - buf = strstrip(buf); - - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; - - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} - -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} - -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} - -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set.
+ */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { + struct cgroup_subsys_state css; + unsigned int state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ + return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ + bool ret; + + rcu_read_lock(); + ret = task_freezer(task)->state & CGROUP_FREEZING; + rcu_read_unlock(); + + return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. + */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct freezer *parent = parent_freezer(freezer); + + mutex_lock(&freezer_mutex); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + mutex_unlock(&freezer_mutex); + return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + + mutex_lock(&freezer_mutex); + + if (freezer->state & CGROUP_FREEZING) + atomic_dec(&system_freezing_cnt); + + freezer->state = 0; + + mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. 
+ * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. + */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? */ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, 0, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. 
+ */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. 
+ */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; -- cgit v1.2.3 From aade7f9efba098859681f8e88d81a5b44ad09b12 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:02 -0700 Subject: cgroup: implement __cgroup_task_count() helper The helper is identical to the existing cgroup_task_count() except it doesn't take the css_set_lock by itself, assuming that the caller does. Also, move cgroup_task_count() implementation into kernel/cgroup/cgroup.c, as there is nothing specific to cgroup v1. 
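As an illustration of the caller contract this helper enables (a minimal sketch, not part of the patch; the function below is hypothetical): a caller that already holds css_set_lock for other reasons can now read the task count directly, without re-taking the lock as cgroup_task_count() would.

/* Hypothetical caller: reads the task count inside an existing
 * css_set_lock critical section. */
static bool cgrp_has_tasks_locked(struct cgroup *cgrp)
{
	lockdep_assert_held(&css_set_lock);

	return __cgroup_task_count(cgrp) > 0;
}
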
Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 16 ---------------- kernel/cgroup/cgroup.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..02c001ffe2e2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -240,6 +240,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..68ca5de7ec27 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -342,22 +342,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f219c195a9a5..3008ea684aa0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -593,6 +593,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; -- cgit v1.2.3 From 4dcabece4c3a9f9522127be12cc12cc120399b2f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:03 -0700 Subject: cgroup: protect cgroup->nr_(dying_)descendants by css_set_lock The number of descendant cgroups and the number of dying descendant cgroups are currently synchronized using the cgroup_mutex. The number of descendant cgroups will be required by the cgroup v2 freezer, which will use it to determine if a cgroup is frozen (depending on the total number of descendants and the number of frozen descendants). It's not always acceptable to grab the cgroup_mutex, especially from quite hot paths (e.g. exit()). To avoid this, let's additionally synchronize these counters using the css_set_lock. So, it's safe to read these counters with either cgroup_mutex or css_set_lock locked, and both locks must be acquired to change them.
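A rough sketch of that locking rule (illustrative only, not from the patch; the reader function below is hypothetical): readers may hold either lock, writers must hold both, as the hunks below do.

/* Reader: either cgroup_mutex or css_set_lock is sufficient. */
static int read_nr_dying_descendants(struct cgroup *cgrp)
{
	int nr;

	spin_lock_irq(&css_set_lock);
	nr = cgrp->nr_dying_descendants;
	spin_unlock_irq(&css_set_lock);

	return nr;
}
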
Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3008ea684aa0..786ceef2f222 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4811,9 +4811,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5031,12 +5033,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (tcgrp != cgrp) tcgrp->nr_descendants++; } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5321,10 +5325,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); -- cgit v1.2.3 From 76f969e8948d82e78e1bc4beb6b9465908e74873 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:04 -0700 Subject: cgroup: cgroup v2 freezer Cgroup v1 implements the freezer controller, which provides the ability to stop the workload in a cgroup and temporarily free up some resources (cpu, io, network bandwidth and, potentially, memory) for some other tasks. Cgroup v2 lacks this functionality. This patch implements a freezer for cgroup v2. The cgroup v2 freezer tries to put tasks into a state similar to jobctl stop. This means that tasks can be killed, ptraced (using PTRACE_SEIZE*), and interrupted. It is possible to attach to a frozen task, get some information (e.g. read registers) and detach. It's also possible to migrate a frozen task to another cgroup. This distinguishes the cgroup v2 freezer from the cgroup v1 freezer, which mostly tried to imitate the system-wide freezer. However, while uninterruptible sleep is fine when all tasks are going to be frozen (the hibernation case), it's not an acceptable state when only some subset of the system is frozen. The cgroup v2 freezer does not support freezing kthreads. If a non-root cgroup contains a kthread, the cgroup can still be frozen, but the kthread will remain running, the cgroup will be shown as non-frozen, and the notification will not be delivered. * PTRACE_ATTACH is not working because non-fatal signal delivery is blocked in frozen state. There are some interface differences between the cgroup v1 and cgroup v2 freezers too, which are required to conform to the cgroup v2 interface design principles: 1) There is no separate controller, which has to be turned on: the functionality is always available and is represented by the cgroup.freeze and cgroup.events cgroup control files. 2) The desired state is defined by the cgroup.freeze control file. Any hierarchical configuration is allowed. 3) The interface is asynchronous. The actual state is available using the cgroup.events control file ("frozen" field). There are no dedicated transitional states.
4) It's allowed to make any changes to the cgroup hierarchy (create new cgroups, remove old cgroups, move tasks between cgroups) regardless of whether some cgroups are frozen. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo No-objection-from-me-by: Oleg Nesterov Cc: kernel-team@fb.com --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup.c | 110 ++++++++++++++++- kernel/cgroup/freezer.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 2 + kernel/signal.c | 70 ++++++++++- 5 files changed, 491 insertions(+), 10 deletions(-) create mode 100644 kernel/cgroup/freezer.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 8d5689ca94b9..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 786ceef2f222..6895464b54c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2435,8 +2435,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. + */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -3477,8 +3484,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3540,6 +3550,40 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) } #endif +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4683,6 +4727,12 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_freeze_show, + .write = cgroup_freeze_write, + }, { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, @@ -5033,12 +5083,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too.
+ */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } spin_unlock_irq(&css_set_lock); @@ -5329,6 +5396,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. + */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); @@ -5782,6 +5855,29 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + struct cgroup *cgrp; + + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + cgrp = cset->dfl_cgrp; + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. + */ + } + spin_unlock_irq(&css_set_lock); } @@ -5830,6 +5926,12 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; + + if (unlikely(cgroup_task_frozen(tsk))) + cgroup_freezer_frozen_exit(tsk); + else if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c new file mode 100644 index 000000000000..9d8cda478fc9 --- /dev/null +++ b/kernel/cgroup/freezer.c @@ -0,0 +1,317 @@ +//SPDX-License-Identifier: GPL-2.0 +#include <linux/cgroup.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> + +#include "cgroup-internal.h" + +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) +{ + int desc = 1; + + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. + */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + desc++; + } + } + } +} + +/* + * Revisit the cgroup frozen state.
+ * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) +{ + bool frozen; + + lockdep_assert_held(&css_set_lock); + + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); + + if (frozen) { + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; + + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? */ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; + + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); +} + +/* + * Increment cgroup's nr_frozen_tasks. + */ +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) +{ + cgrp->freezer.nr_frozen_tasks++; +} + +/* + * Decrement cgroup's nr_frozen_tasks. + */ +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) +{ + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); +} + +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) +{ + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); +} + +/* + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + * + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. + */ +void cgroup_leave_frozen(bool always_leave) +{ + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } + spin_unlock_irq(&css_set_lock); + + if (unlikely(current->frozen)) { + /* + * If the task remained in the frozen state, + * make sure it won't reach userspace without + * entering the signal handling loop. + */ + spin_lock_irq(&current->sighand->siglock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + } +} + +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. + */ +static void cgroup_freeze_task(struct task_struct *task, bool freeze) +{ + unsigned long flags; + + /* If the task is about to die, don't bother with freezing it. */ + if (!lock_task_sighand(task, &flags)) + return; + + if (freeze) { + task->jobctl |= JOBCTL_TRAP_FREEZE; + signal_wake_up(task, false); + } else { + task->jobctl &= ~JOBCTL_TRAP_FREEZE; + wake_up_process(task); + } + + unlock_task_sighand(task, &flags); +} + +/* + * Freeze or unfreeze all tasks in the given cgroup.
+ */ +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + if (freeze) + set_bit(CGRP_FREEZE, &cgrp->flags); + else + clear_bit(CGRP_FREEZE, &cgrp->flags); + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, 0, &it); + while ((task = css_task_iter_next(&it))) { + /* + * Ignore kernel threads here. Freezing cgroups containing + * kthreads isn't supported. + */ + if (task->flags & PF_KTHREAD) + continue; + cgroup_freeze_task(task, freeze); + } + css_task_iter_end(&it); + + /* + * Cgroup state should be revisited here to cover empty leaf cgroups + * and cgroups which descendants are already in the desired state. + */ + spin_lock_irq(&css_set_lock); + if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); +} + +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups. + */ +void cgroup_freezer_migrate_task(struct task_struct *task, + struct cgroup *src, struct cgroup *dst) +{ + lockdep_assert_held(&css_set_lock); + + /* + * Kernel threads are not supposed to be frozen at all. + */ + if (task->flags & PF_KTHREAD) + return; + + /* + * Adjust counters of freezing and frozen tasks. + * Note, that if the task is frozen, but the destination cgroup is not + * frozen, we bump both counters to keep them balanced. + */ + if (task->frozen) { + cgroup_inc_frozen_cnt(dst); + cgroup_dec_frozen_cnt(src); + } + cgroup_update_frozen(dst); + cgroup_update_frozen(src); + + /* + * Force the task to the desired state. + */ + cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); +} + +void cgroup_freezer_frozen_exit(struct task_struct *task) +{ + struct cgroup *cgrp = task_dfl_cgroup(task); + + lockdep_assert_held(&css_set_lock); + + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); +} + +void cgroup_freeze(struct cgroup *cgrp, bool freeze) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + bool applied = false; + + lockdep_assert_held(&cgroup_mutex); + + /* + * Nothing changed? Just exit. + */ + if (cgrp->freezer.freeze == freeze) + return; + + cgrp->freezer.freeze = freeze; + + /* + * Propagate changes downwards the cgroup tree. + */ + css_for_each_descendant_pre(css, &cgrp->self) { + dsct = css->cgroup; + + if (cgroup_is_dead(dsct)) + continue; + + if (freeze) { + dsct->freezer.e_freeze++; + /* + * Already frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 1) + continue; + } else { + dsct->freezer.e_freeze--; + /* + * Still frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 0) + continue; + + WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + } + + /* + * Do change actual state: freeze or unfreeze. + */ + cgroup_do_freeze(dsct, freeze); + applied = true; + } + + /* + * Even if the actual state hasn't changed, let's notify a user. + * The state can be enforced by an ancestor cgroup: the cgroup + * can already be in the desired state or it can be locked in the + * opposite state, so that the transition will never happen. + * In both cases it's better to notify a user, that there is + * nothing to wait for. 
+ */ + if (!applied) + cgroup_file_notify(&cgrp->events_file); +} diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..8097a0cce4db 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,9 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); + cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); + cgroup_leave_frozen(false); freezer_count(); if (killed) { diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..095e0fc57b25 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -43,6 +43,7 @@ #include #include #include +#include <linux/cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -146,9 +147,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) static bool recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & JOBCTL_PENDING_MASK) || + if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { + PENDING(&t->signal->shared_pending, &t->blocked) || + cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } @@ -2108,6 +2110,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); + cgroup_enter_frozen(); freezable_schedule(); } else { /* @@ -2286,6 +2289,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ + cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2332,6 +2336,43 @@ static void do_jobctl_trap(void) } } +/** + * do_freezer_trap - handle the freezer jobctl trap + * + * Puts the task into frozen state, if only the task is not about to quit. + * In this case it drops JOBCTL_TRAP_FREEZE. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, + * which is always released before returning. + */ +static void do_freezer_trap(void) + __releases(&current->sighand->siglock) +{ + /* + * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, + * let's make another loop to give it a chance to be handled. + * In any case, we'll return back. + */ + if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != + JOBCTL_TRAP_FREEZE) { + spin_unlock_irq(&current->sighand->siglock); + return; + } + + /* + * Now we're sure that there is no pending fatal signal and no + * pending traps. Clear TIF_SIGPENDING to not get out of schedule() + * immediately (if there is a non-fatal signal pending), and + * put the task into sleep.
+ */ + __set_current_state(TASK_INTERRUPTIBLE); + clear_thread_flag(TIF_SIGPENDING); + spin_unlock_irq(&current->sighand->siglock); + cgroup_enter_frozen(); + freezable_schedule(); +} + static int ptrace_signal(int signr, kernel_siginfo_t *info) { /* @@ -2442,6 +2483,10 @@ relock: ksig->info.si_signo = signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); recalc_sigpending(); + current->jobctl &= ~JOBCTL_TRAP_FREEZE; + spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); goto fatal; } @@ -2452,9 +2497,24 @@ relock: do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { - do_jobctl_trap(); + if (unlikely(current->jobctl & + (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { + if (current->jobctl & JOBCTL_TRAP_MASK) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + } else if (current->jobctl & JOBCTL_TRAP_FREEZE) + do_freezer_trap(); + + goto relock; + } + + /* + * If the task is leaving the frozen state, let's update + * cgroup counters and reset the frozen bit. + */ + if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); + cgroup_leave_frozen(true); goto relock; } @@ -2548,8 +2608,8 @@ relock: continue; } - fatal: spin_unlock_irq(&sighand->siglock); + fatal: /* * Anything else is fatal, maybe with a core dump. -- cgit v1.2.3 From 712e35178754bbb785d00d5fcf5abaf32699bf11 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:07 -0700 Subject: cgroup: make TRACE_CGROUP_PATH irq-safe To use the TRACE_CGROUP_PATH() macro with css_set_lock locked, let's make the macro irq-safe. It's necessary in order to trace cgroup freezer state transitions (frozen/not frozen), which happen with css_set_lock locked. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 02c001ffe2e2..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) -- cgit v1.2.3 From 4c476d8cff48853645abc822154aaad208faebcc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:08 -0700 Subject: cgroup: add tracing points for cgroup v2 freezer Add cgroup:cgroup_freeze and cgroup:cgroup_unfreeze events, which use the existing cgroup tracing infrastructure. Add the cgroup_event event class, which is similar to the cgroup class, but contains an additional integer field to store a new value (the level field is dropped). Also add two tracing events: cgroup_notify_populated and cgroup_notify_frozen, which are raised in a generic way using the TRACE_CGROUP_PATH() macro. This makes it possible to trace cgroup state transitions and is generally helpful for debugging the cgroup freezer code.
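To make the event-class change concrete, here is a hedged sketch of how an event in the new cgroup_event class is declared (modeled on the style of include/trace/events/cgroup.h; the event name cgroup_my_event is hypothetical):

DEFINE_EVENT(cgroup_event, cgroup_my_event,

	TP_PROTO(struct cgroup *cgrp, const char *path, int val),

	TP_ARGS(cgrp, path, val)
);

At runtime the events can then be enabled through the usual tracefs files, e.g. under /sys/kernel/debug/tracing/events/cgroup/ (assuming tracefs is mounted there).
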
Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ kernel/cgroup/freezer.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6895464b54c6..57edcf398d71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -816,6 +816,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 9d8cda478fc9..3bfbb3c8baf3 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -6,6 +6,8 @@ #include "cgroup-internal.h" +#include <trace/events/cgroup.h> + /* * Propagate the cgroup frozen state upwards by the cgroup tree. */ @@ -28,6 +30,7 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) cgrp->nr_descendants) { set_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); desc++; } } else { @@ -35,6 +38,7 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) if (test_bit(CGRP_FROZEN, &cgrp->flags)) { clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); desc++; } } @@ -73,6 +77,7 @@ void cgroup_update_frozen(struct cgroup *cgrp) clear_bit(CGRP_FROZEN, &cgrp->flags); } cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); /* Update the state of ancestor cgroups. */ cgroup_propagate_frozen(cgrp, frozen); @@ -189,6 +194,11 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) clear_bit(CGRP_FREEZE, &cgrp->flags); spin_unlock_irq(&css_set_lock); + if (freeze) + TRACE_CGROUP_PATH(freeze, cgrp); + else + TRACE_CGROUP_PATH(unfreeze, cgrp); + css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* @@ -312,6 +322,9 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) * In both cases it's better to notify a user, that there is * nothing to wait for.
*/ - if (!applied) + if (!applied) { + TRACE_CGROUP_PATH(notify_frozen, cgrp, + test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); + } } -- cgit v1.2.3 From ad2e379def135ebc079f89a0e0b1d987d243f949 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Nov 2018 15:23:50 +0000 Subject: sched/debug: Fix spelling mistake "logaritmic" -> "logarithmic" Signed-off-by: Colin Ian King Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20181128152350.13622-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -702,7 +702,7 @@ do { \ static const char *sched_tunable_scaling_names[] = { "none", - "logaritmic", + "logarithmic", "linear" }; -- cgit v1.2.3 From 52fde6e70cccc2fcf3f39fed0d0392960e2c2b03 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 21 Apr 2019 19:40:44 -0400 Subject: function_graph: Have selftest also emulate tr->reset() as it did with tr->init() The function_graph boot-up selftest emulates the tr->init() function in order to add a wrapper around the function graph tracer entry code to test for lock ups and such. But it does not emulate the tr->reset(), and just calls the function_graph tracer tr->reset() function, which will use its own fgraph_ops to unregister function tracing. As the fgraph_ops is becoming more meaningful with the register_ftrace_graph() and unregister_ftrace_graph() functions, the two need to be the same. The emulated tr->init() uses its own fgraph_ops descriptor, which means the unregister_ftrace_graph() must use the same ftrace_ops, which the selftest currently does not do. By emulating the tr->reset() as the selftest does with the tr->init() it will be able to pass the same fgraph_ops descriptor to the unregister_ftrace_graph() as it did with the register_ftrace_graph(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); + /* Need to also simulate the tr->reset to remove this fgraph_ops */ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(&fgraph_ops); + tracing_start(); if (!ret && !count) { -- cgit v1.2.3 From 70c4cf17e445264453bc5323db3e50aa0ac9e81f Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 19 Apr 2019 20:49:29 -0500 Subject: audit: fix a memory leak bug In audit_rule_change(), audit_data_to_entry() is first invoked to translate the payload data to the kernel's rule representation. In audit_data_to_entry(), depending on the audit field type, an audit tree may be created in audit_make_tree(), which eventually invokes kmalloc() to allocate the tree. Since this tree is a temporary tree, it will then be freed later in the execution, e.g., in audit_add_rule() if the message type is AUDIT_ADD_RULE or in audit_del_rule() if the message type is AUDIT_DEL_RULE.
However, if the message type is neither AUDIT_ADD_RULE nor AUDIT_DEL_RULE, i.e., the default case of the switch statement, this temporary tree is not freed. To fix this issue, only allocate the tree when the type is AUDIT_ADD_RULE or AUDIT_DEL_RULE. Signed-off-by: Wenwen Wang Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2c3c2f349b23..1bc6410413e6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1114,22 +1114,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) int err = 0; struct audit_entry *entry; - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - switch (type) { case AUDIT_ADD_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - err = -EINVAL; WARN_ON(1); + return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { -- cgit v1.2.3 From 7df737e991069d75eec1ded1c8b37e81b8c54df9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:54 -0700 Subject: bpf: remove global variables Move three global variables protected by bpf_verifier_lock into 'struct bpf_verifier_env' to allow parallel verification. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db301e9b5295..5f0eb5bd5589 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5369,10 +5369,6 @@ enum { #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) -static int *insn_stack; /* stack of insns to process */ -static int cur_stack; /* current stack index */ -static int *insn_state; - /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -5380,6 +5376,9 @@ static int *insn_state; */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -5400,9 +5399,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; - if (cur_stack >= env->prog->len) + if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; - insn_stack[cur_stack++] = w; + insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { verbose_linfo(env, t, "%d: ", t); @@ -5426,14 +5425,15 @@ static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; int ret = 0; int i, t; - insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), 
GFP_KERNEL); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } @@ -5441,12 +5441,12 @@ static int check_cfg(struct bpf_verifier_env *env) insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ - cur_stack = 1; + env->cfg.cur_stack = 1; peek_stack: - if (cur_stack == 0) + if (env->cfg.cur_stack == 0) goto check_state; - t = insn_stack[cur_stack - 1]; + t = insn_stack[env->cfg.cur_stack - 1]; if (BPF_CLASS(insns[t].code) == BPF_JMP || BPF_CLASS(insns[t].code) == BPF_JMP32) { @@ -5515,7 +5515,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; - if (cur_stack-- <= 0) { + if (env->cfg.cur_stack-- <= 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; @@ -5535,6 +5535,7 @@ check_state: err_free: kvfree(insn_state); kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } -- cgit v1.2.3 From 45a73c17bfb92c3ceebedc80a750ef2c2931c26b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:55 -0700 Subject: bpf: drop bpf_verifier_lock Drop bpf_verifier_lock for root to avoid being DoS-ed by unprivileged users. The BPF verifier is now fully parallel. All unpriv users are still serialized by bpf_verifier_lock to avoid exhausting kernel memory by running N parallel verifications. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f0eb5bd5589..423f242a5efb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8132,9 +8132,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + is_priv = capable(CAP_SYS_ADMIN); /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); + if (!is_priv) + mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output @@ -8157,7 +8159,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - is_priv = capable(CAP_SYS_ADMIN); env->allow_ptr_leaks = is_priv; ret = replace_map_fd_with_map_ptr(env); @@ -8270,7 +8271,8 @@ err_release_maps: release_maps(env); *prog = env->prog; err_unlock: - mutex_unlock(&bpf_verifier_lock); + if (!is_priv) + mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: kfree(env); -- cgit v1.2.3 From 6beff00b79ca0b5caf0ce6fb8e11f57311bd95f8 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 6 Mar 2019 13:14:12 -0700 Subject: seccomp: fix up grammar in comment This sentence is kind of a train wreck anyway, but at least dropping the extra pronoun helps somewhat. Signed-off-by: Tycho Andersen Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..503d02896c5d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent, * Expects sighand and cred_guard_mutex locks to be held.
* * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) -- cgit v1.2.3 From 148a97d5a02a62f81b5d6176f871c94a65e1f3af Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Apr 2019 17:24:37 +0300 Subject: dma-mapping: remove an unnecessary NULL check We already dereferenced "dev" when we called get_dma_ops() so this NULL check is too late. We're not supposed to pass NULL "dev" pointers to dma_alloc_attrs(). Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 685a53f2a793..f7afdadb6770 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -244,7 +244,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; - WARN_ON_ONCE(dev && !dev->coherent_dma_mask); + WARN_ON_ONCE(!dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; -- cgit v1.2.3 From 877b5691f27a1aec0d9b53095a323e45c30069e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 14 Apr 2019 17:37:09 -0700 Subject: crypto: shash - remove shash_desc::flags The flags field in 'struct shash_desc' never actually does anything. The only ostensibly supported flag is CRYPTO_TFM_REQ_MAY_SLEEP. However, no shash algorithm ever sleeps, making this flag a no-op. With this being the case, inevitably some users who can't sleep wrongly pass MAY_SLEEP. These would all need to be fixed if any shash algorithm actually started sleeping. For example, the shash_ahash_*() functions, which wrap a shash algorithm with the ahash API, pass through MAY_SLEEP from the ahash API to the shash API. However, the shash functions are called under kmap_atomic(), so actually they're assumed to never sleep. Even if it turns out that some users do need preemption points while hashing large buffers, we could easily provide a helper function crypto_shash_update_large() which divides the data into smaller chunks and calls crypto_shash_update() and cond_resched() for each chunk. It's not necessary to have a flag in 'struct shash_desc', nor is it necessary to make individual shash algorithms aware of this at all. Therefore, remove shash_desc::flags, and document that the crypto_shash_*() functions can be called from any context. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- kernel/kexec_file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..f7fb8f6a688f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image) goto out_free_desc; desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) -- cgit v1.2.3 From 52ba92f5882adf1ee785c4c5ef23491948917fcd Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:41 -0400 Subject: irqdesc: Replace irq_kobj_type's default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace irq_kobj_type's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create irq_groups. 
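For context, ATTRIBUTE_GROUPS(irq) roughly expands to the following (a simplified sketch of the macro from <linux/sysfs.h>, omitting details such as the optional visibility hooks):

static const struct attribute_group irq_group = {
	.attrs = irq_attrs,
};

static const struct attribute_group *irq_groups[] = {
	&irq_group,
	NULL,
};
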
This patch was tested by verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/irqdesc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 13539e12cd80..bbec57bda666 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = { &actions_attr.attr, NULL }; +ATTRIBUTE_GROUPS(irq); static struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = irq_attrs, + .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) -- cgit v1.2.3 From 2064fbc779d43c5ac38e20a4b4979e365f87349f Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:47 -0400 Subject: padata: Replace padata_attr_type default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace padata_attr_type's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create padata_default_groups. This patch was tested by loading the pcrypt module and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Signed-off-by: Greg Kroah-Hartman --- kernel/padata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { ¶llel_cpumask_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { static struct kobj_type padata_attr_type = { .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, + .default_groups = padata_default_groups, .release = padata_sysfs_release, }; -- cgit v1.2.3 From 9782adeb3d9d6e33fc52392031b8e00270515442 Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:53 -0400 Subject: cpufreq: schedutil: Replace default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace sugov_tunables_ktype's default_attrs field with default groups. Change "sugov_attributes" to "sugov_attrs" and use the ATTRIBUTE_GROUPS macro to create sugov_groups. This patch was tested by setting the scaling governor to schedutil and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Acked-by: Peter Zijlstra (Intel) Acked-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c41ea367422..148b60c8993d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -598,13 +598,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; -- cgit v1.2.3 From 70283454c918f1d65de0ec50c45ef592d781bcae Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:58 -0400 Subject: livepatch: Replace klp_ktype_patch's default_attrs with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace klp_ktype_patch's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create klp_patch_groups. This patch was tested by loading the livepatch-sample module and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Acked-by: Jiri Kosina Acked-by: Miroslav Benes Acked-by: Petr Mladek Signed-off-by: Greg Kroah-Hartman --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..34a8338657d2 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = { &force_kobj_attr.attr, NULL }; +ATTRIBUTE_GROUPS(klp_patch); static void klp_free_object_dynamic(struct klp_object *obj) { @@ -546,7 +547,7 @@ static void klp_kobj_release_patch(struct kobject *kobj) static struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = klp_patch_attrs, + .default_groups = klp_patch_groups, }; static void klp_kobj_release_object(struct kobject *kobj) -- cgit v1.2.3 From 118c8e9ae629d35fa9b3d27a7b9d59298b1b4183 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Apr 2019 14:37:23 -0700 Subject: bpf: support BPF_PROG_QUERY for BPF_FLOW_DISSECTOR attach_type target_fd is target namespace. If there is a flow dissector BPF program attached to that namespace, its (single) id is returned. 
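A hedged sketch of the corresponding userspace query is below; the query_flow_dissector() helper and the raw syscall(2) wrapper are assumptions for illustration, while the BPF_PROG_QUERY attribute layout is the existing uapi one.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper: return the id of the flow dissector program
 * attached in the netns referred to by netns_fd, 0 if none, -1 on error.
 */
static int query_flow_dissector(int netns_fd)
{
	union bpf_attr attr;
	__u32 prog_id = 0;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd = netns_fd;	/* the target namespace */
	attr.query.attach_type = BPF_FLOW_DISSECTOR;
	attr.query.prog_ids = (__u64)(unsigned long)&prog_id;
	attr.query.prog_cnt = 1;		/* room for the single id */

	if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) < 0)
		return -1;

	return attr.query.prog_cnt ? (int)prog_id : 0;
}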
v5: * drop net ref right after rcu unlock (Daniel Borkmann) v4: * add missing put_net (Jann Horn) v3: * add missing inline to skb_flow_dissector_prog_query static def (kbuild test robot ) v2: * don't sleep in rcu critical section (Jakub Kicinski) * check input prog_cnt (exit early) Cc: Jann Horn Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 92c9b8a32b50..b0de49598341 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,6 +2009,8 @@ static int bpf_prog_query(const union bpf_attr *attr, break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } -- cgit v1.2.3 From 6b4f4bc9cb22875f97023984a625386f0c7cc1c0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2019 11:58:08 +0000 Subject: locking/futex: Allow low-level atomic operations to return -EAGAIN Some futex() operations, including FUTEX_WAKE_OP, require the kernel to perform an atomic read-modify-write of the futex word via the userspace mapping. These operations are implemented by each architecture in arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which are called in atomic context with the relevant hash bucket locks held. Although these routines may return -EFAULT in response to a page fault generated when accessing userspace, they are expected to succeed (i.e. return 0) in all other cases. This poses a problem for architectures that do not provide bounded forward progress guarantees or fairness of contended atomic operations and can lead to starvation in some cases. In these problematic scenarios, we must return back to the core futex code so that we can drop the hash bucket locks and reschedule if necessary, much like we do in the case of a page fault. Allow architectures to return -EAGAIN from their implementations of arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which will cause the core futex code to reschedule if necessary and return back to the architecture code later on. Cc: Acked-by: Peter Zijlstra (Intel) Signed-off-by: Will Deacon --- kernel/futex.c | 188 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 117 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9e40cf7be606..6262f1534ac9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { + int err; u32 uninitialized_var(curval); if (unlikely(should_fail_futex(true))) return -EFAULT; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; /* If user space value changed, let the caller retry */ return curval != uval ? 
-EAGAIN : 0; @@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { - ret = -EFAULT; - - } else if (curval != uval) { + ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (!ret && (curval != uval)) { /* * If a unconditional UNLOCK_PI operation (user space did not * try the TID->0 transition) raced with a waiter setting the @@ -1700,32 +1700,32 @@ retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ ret = op_ret; goto out_put_keys; } - ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + goto out_put_keys; + } - if (!(flags & FLAGS_SHARED)) + if (!(flags & FLAGS_SHARED)) { + cond_resched(); goto retry_private; + } put_futex_key(&key2); put_futex_key(&key1); + cond_resched(); goto retry; } @@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 uval, uninitialized_var(curval), newval; struct task_struct *oldowner, *newowner; u32 newtid; - int ret; + int ret, err = 0; lockdep_assert_held(q->lock_ptr); @@ -2421,14 +2421,17 @@ retry: if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; + err = get_futex_value_locked(&uval, uaddr); + if (err) + goto handle_err; for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + if (curval == uval) break; uval = curval; @@ -2456,23 +2459,37 @@ retry: return 0; /* - * To handle the page fault we need to drop the locks here. That gives - * the other task (either the highest priority waiter itself or the - * task which stole the rtmutex) the chance to try the fixup of the - * pi_state. So once we are back from handling the fault we need to - * check the pi_state after reacquiring the locks and before trying to - * do another fixup. When the fixup has been done already we simply - * return. + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. When + * the fixup has been done already we simply return. * * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely * drop hb->lock since the caller owns the hb -> futex_q relation. * Dropping the pi_mutex->wait_lock requires the state revalidate. 
*/ -handle_fault: +handle_err: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); - ret = fault_in_user_writeable(uaddr); + switch (err) { + case -EFAULT: + ret = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + ret = 0; + break; + + default: + WARN_ON_ONCE(1); + ret = err; + break; + } spin_lock(q->lock_ptr); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -3041,10 +3058,8 @@ retry: * A unconditional UNLOCK_PI op raced against a waiter * setting the FUTEX_WAITERS bit. Try again. */ - if (ret == -EAGAIN) { - put_futex_key(&key); - goto retry; - } + if (ret == -EAGAIN) + goto pi_retry; /* * wake_futex_pi has detected invalid state. Tell user * space. @@ -3059,9 +3074,19 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { spin_unlock(&hb->lock); - goto pi_faulted; + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + goto out_putkey; + } } /* @@ -3075,6 +3100,11 @@ out_putkey: put_futex_key(&key); return ret; +pi_retry: + put_futex_key(&key); + cond_resched(); + goto retry; + pi_faulted: put_futex_key(&key); @@ -3435,6 +3465,7 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; + int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) @@ -3444,42 +3475,57 @@ retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. 
+ */ + if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; - } - if (nval != uval) + + case -EAGAIN: + cond_resched(); goto retry; - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + default: + WARN_ON_ONCE(1); + return err; + } } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; } -- cgit v1.2.3 From e43e9c339a78a0978f4ce473f645cedc05e6a57c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 13:51:03 -0400 Subject: fsnotify: switch send_to_group() and ->handle_event to const struct qstr * note that conditions surrounding accesses to dname in audit_watch_handle_event() and audit_mark_handle_event() guarantee that dname won't have been NULL. Signed-off-by: Al Viro --- kernel/audit_fsnotify.c | 4 ++-- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 37ae95cfb7f4..fb241805569c 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -164,7 +164,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); @@ -188,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { - if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + if (audit_compare_dname_path(dname->name, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index abfb112f26aa..e49c912f862d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1040,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie, + const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e8d1adeb2223..3c12fd5b680e 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -482,7 +482,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); @@ -507,9 +507,9 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + 
audit_update_watch(parent, dname->name, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + audit_update_watch(parent, dname->name, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); -- cgit v1.2.3 From 6921d4ebe418e7cce9f65c1f38c93ea82a1f546c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 14:09:49 -0400 Subject: audit_update_watch(): switch to const struct qstr * Signed-off-by: Al Viro --- kernel/audit_watch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3c12fd5b680e..d832ce9df065 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -255,18 +255,19 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, + const struct qstr *dname, dev_t dev, unsigned long ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; struct audit_entry *oentry, *nentry; + const unsigned char *name = dname->name; mutex_lock(&audit_filter_mutex); /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, + if (audit_compare_dname_path(name, owatch->path, AUDIT_NAME_FULL)) continue; @@ -507,9 +508,9 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname->name, inode->i_sb->s_dev, inode->i_ino, 0); + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname->name, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); -- cgit v1.2.3 From 9df1c28bb75217b244257152ab7d788bb2a386d0 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:47 -0700 Subject: bpf: add writable context for raw tracepoints This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature. The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter. 
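By way of illustration, here is a hedged sketch of a program for such a tracepoint. The tracepoint name, the buffer layout and the "raw_tracepoint.w/" section prefix (a later libbpf convention) are assumptions for the example, not part of this patch.

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* selftests-style SEC() helper */

/* Hypothetical layout of the writable buffer the tracepoint exposes. */
struct example_writable {
	int handled;
};

SEC("raw_tracepoint.w/example_event")
int mark_handled(struct bpf_raw_tracepoint_args *ctx)
{
	/* args[0] is the writable buffer pointer for opted-in tracepoints */
	struct example_writable *buf =
		(struct example_writable *)(long)ctx->args[0];

	/* the store is only permitted for the _WRITABLE program type */
	buf->handled = 1;
	return 0;
}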
Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 24 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b0de49598341..ae141e745f92 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1789,12 +1789,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 423f242a5efb..2ef442c62c0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -405,6 +405,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -1993,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2137,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91800be0c8eb..8607aba1d882 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -915,6 +915,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = 
raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1204,6 +1225,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } -- cgit v1.2.3 From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). When bpf prog needs to store data for a sk, the current practice is to define a map with the usual 4-tuples (src/dst ip/port) as the key. If multiple bpf progs require to store different sk data, multiple maps have to be defined. Hence, wasting memory to store the duplicated keys (i.e. 4 tuples here) in each of the bpf map. [ The smallest key could be the sk pointer itself which requires some enhancement in the verifier and it is a separate topic. ] Also, the bpf prog needs to clean up the elem when sk is freed. Otherwise, the bpf map will become full and un-usable quickly. The sk-free tracking currently could be done during sk state transition (e.g. BPF_SOCK_OPS_STATE_CB). The size of the map needs to be predefined which then usually ended-up with an over-provisioned map in production. Even the map was re-sizable, while the sk naturally come and go away already, this potential re-size operation is arguably redundant if the data can be directly connected to the sk itself instead of proxy-ing through a bpf map. This patch introduces sk->sk_bpf_storage to provide local storage space at sk for bpf prog to use. The space will be allocated when the first bpf prog has created data for this particular sk. The design optimizes the bpf prog's lookup (and then optionally followed by an inline update). bpf_spin_lock should be used if the inline update needs to be protected. BPF_MAP_TYPE_SK_STORAGE: ----------------------- To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can be created to fit different bpf progs' needs. The map enforces BTF to allow printing the sk-local-storage during a system-wise sk dump (e.g. "ss -ta") in the future. The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete a "sk-local-storage" data from a particular sk. Think of the map as a meta-data (or "type") of a "sk-local-storage". This particular "type" of "sk-local-storage" data can then be stored in any sk. The main purposes of this map are mostly: 1. Define the size of a "sk-local-storage" type. 2. Provide a similar syscall userspace API as the map (e.g. lookup/update, map-id, map-btf...etc.) 3. 
Keep track of all sk's storages of this "type" and clean them up when the map is freed. sk->sk_bpf_storage: ------------------ The main lookup/update/delete is done on sk->sk_bpf_storage (which is a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is now used as the "key" to search on the sk_storage->list. The "map" pointer is actually serving as the "type" of the "sk-local-storage" that is being requested. To allow very fast lookup, it should be as fast as looking up an array at a stable-offset. At the same time, it is not ideal to set a hard limit on the number of sk-local-storage "type" that the system can have. Hence, this patch takes a cache approach. The last search result from sk_storage->list is cached in sk_storage->cache[] which is a stable sized array. Each "sk-local-storage" type has a stable offset to the cache[] array. In the future, a map's flag could be introduced to do cache opt-out/enforcement if it became necessary. The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs can share map. On the program side, having a few bpf_progs running in the networking hotpath is already a lot. The bpf_prog should have already consolidated the existing sock-key-ed map usage to minimize the map lookup penalty. 16 has enough runway to grow. All sk-local-storage data will be removed from sk->sk_bpf_storage during sk destruction. bpf_sk_storage_get() and bpf_sk_storage_delete(): ------------------------------------------------ Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog needs to use the new helper bpf_sk_storage_get() and bpf_sk_storage_delete(). The verifier can then enforce the ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to "create" new elem if one does not exist in the sk. It is done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together, it has eliminated the potential use cases for an equivalent bpf_map_update_elem() API (for bpf_prog) in this patch. Misc notes: ---------- 1. map_get_next_key is not supported. From the userspace syscall perspective, the map has the socket fd as the key while the map can be shared by pinned-file or map-id. Since btf is enforced, the existing "ss" could be enhanced to pretty print the local-storage. Supporting a kernel defined btf with 4 tuples as the return key could be explored later also. 2. The sk->sk_lock cannot be acquired. Atomic operations is used instead. e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please refer to the source code comments for the details in synchronization cases and considerations. 3. The mem is charged to the sk->sk_omem_alloc as the sk filter does. Benchmark: --------- Here is the benchmark data collected by turning on the "kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested: One bpf prog with the usual bpf hashmap (max_entries = 8192) with the sk ptr as the key. (verifier is modified to support sk ptr as the key That should have shortened the key lookup time.) Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE. Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for each egress skb and then bump the cnt. netperf is used to drive data with 4096 connected UDP sockets. 
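The per-socket counting pattern being measured looks roughly like the hedged sketch below; it is a reconstruction, not the exact benchmark program. Map and field names are illustrative, and the BTF-style map definition follows later libbpf conventions.

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct sk_cnt {
	__u32 egress_pkts;
};

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required for sk storage */
	__type(key, int);			/* socket fd on the syscall side */
	__type(value, struct sk_cnt);
} sk_cnt_map SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct sk_cnt *cnt;

	if (!sk)
		return 1;
	sk = bpf_sk_fullsock(sk);	/* the helper wants a fullsock */
	if (!sk)
		return 1;

	/* lookup, creating the per-sk element on first use */
	cnt = bpf_sk_storage_get(&sk_cnt_map, sk, NULL,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (cnt)
		cnt->egress_pkts++;

	return 1;	/* 1 == allow the packet */
}

The measurements below compare the plain hashmap variant of this pattern against the sk-storage one.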
BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae141e745f92..ad3ccf82f31d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -526,7 +526,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ef442c62c0e..271717246af3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2543,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2578,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2635,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == 
ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2784,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2847,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } -- cgit v1.2.3 From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:06 +0200 Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most netlink based interfaces (including recently added ones) are still not setting it in kernel generated messages. Without the flag, message parsers not aware of attribute semantics (e.g. wireshark dissector or libmnl's mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display the structure of their contents. Unfortunately we cannot just add the flag everywhere as there may be userspace applications which check nlattr::nla_type directly rather than through a helper masking out the flags. Therefore the patch renames nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start() as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually are rewritten to use nla_nest_start(). Except for changes in include/net/netlink.h, the patch was generated using this semantic patch: @@ expression E1, E2; @@ -nla_nest_start(E1, E2) +nla_nest_start_noflag(E1, E2) @@ expression E1, E2; @@ -nla_nest_start_noflag(E1, E2 | NLA_F_NESTED) +nla_nest_start(E1, E2) Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 1b942a7caf26..ef4f9cd980fd 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -375,7 +375,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - na = nla_nest_start(skb, aggr); + na = nla_nest_start_noflag(skb, aggr); if (!na) goto err; -- cgit v1.2.3 From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:28 +0200 Subject: netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. 
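Sketched in C, the four options combine roughly as follows. The names are modelled on the NL_VALIDATE_* flags in include/net/netlink.h; treat the exact values as illustrative.

/* hedged sketch of the strictness flags, not a verbatim copy */
enum netlink_validation {
	NL_VALIDATE_LIBERAL	 = 0,	/* the old default */
	NL_VALIDATE_TRAILING	 = 1,	/* no trailing data after attrs */
	NL_VALIDATE_MAXTYPE	 = 2,	/* reject attrs > max known type */
	NL_VALIDATE_UNSPEC	 = 4,	/* reject NLA_UNSPEC attrs */
	NL_VALIDATE_STRICT_ATTRS = 8,	/* strict attribute size checks */
};

/* what the old opt-in "strict" parsing corresponds to */
#define NL_VALIDATE_DEPRECATED_STRICT \
	(NL_VALIDATE_TRAILING | NL_VALIDATE_MAXTYPE)

/* everything, the intended default for new interfaces */
#define NL_VALIDATE_STRICT \
	(NL_VALIDATE_TRAILING | NL_VALIDATE_MAXTYPE | \
	 NL_VALIDATE_UNSPEC | NL_VALIDATE_STRICT_ATTRS)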
The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ef4f9cd980fd..0e347f1c7800 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -677,8 +677,9 @@ static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return -EINVAL; } - return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, - policy, info->extack); + return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, + TASKSTATS_CMD_ATTR_MAX, policy, + info->extack); } static struct genl_family family __ro_after_init = { -- cgit v1.2.3 From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:31 +0200 Subject: genetlink: optionally validate strictly/dumps Add options to strictly validate messages and dump messages, sometimes perhaps validating dump messages non-strictly may be required, so add an option for that as well. 
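For a concrete picture, a hedged sketch of the resulting genl_ops pattern follows; the EXAMPLE_* identifiers are hypothetical, while the GENL_DONT_VALIDATE_* flags are the ones the spatch below inserts.

static const struct genl_ops example_ops[] = {
	{
		/* pre-existing command: opt out to keep old behaviour */
		.cmd	  = EXAMPLE_CMD_OLD,
		.validate = GENL_DONT_VALIDATE_STRICT |
			    GENL_DONT_VALIDATE_DUMP,
		.doit	  = example_old_doit,
	},
	{
		/* new command: no opt-out flags, so fully strict */
		.cmd  = EXAMPLE_CMD_NEW,
		.doit = example_new_doit,
	},
};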
Since none of this can really be applied to existing commands, set the options everwhere using the following spatch: @@ identifier ops; expression X; @@ struct genl_ops ops[] = { ..., { .cmd = X, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ... }, ... }; For new commands one should just not copy the .validate 'opt-out' flags and thus get strict validation. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0e347f1c7800..5f852b8f59f7 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -649,12 +649,14 @@ err: static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, /* policy enforced later */ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, /* policy enforced later */ .flags = GENL_CMD_CAP_HASPOL, -- cgit v1.2.3 From 795d673af1afae8146ac3070a2d77cfae5287c43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 14:11:05 -0400 Subject: audit_compare_dname_path(): switch to const struct qstr * Signed-off-by: Al Viro --- kernel/audit.h | 2 +- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 3 +-- kernel/auditfilter.c | 6 +++--- kernel/auditsc.c | 4 ++-- 5 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 958d5b8fc1b3..2071725a999f 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -231,7 +231,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, int plen); +extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen); extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index fb241805569c..b5737b826951 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -188,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { - if (audit_compare_dname_path(dname->name, audit_mark->path, AUDIT_NAME_FULL)) + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d832ce9df065..b50c574223fa 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -261,13 +261,12 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; struct audit_entry *oentry, *nentry; - const unsigned char *name = dname->name; mutex_lock(&audit_filter_mutex); /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(name, owatch->path, + if 
(audit_compare_dname_path(dname, owatch->path, AUDIT_NAME_FULL)) continue; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..f9fff93c3351 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,12 +1290,12 @@ int parent_len(const char *path) * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. */ -int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; - dlen = strlen(dname); + dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; @@ -1306,7 +1306,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) p = path + parentlen; - return strncmp(p, dname, dlen); + return strncmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..92d0ae63febd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2045,7 +2045,7 @@ void __audit_inode_child(struct inode *parent, { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); - const char *dname = dentry->d_name.name; + const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; @@ -2099,7 +2099,7 @@ void __audit_inode_child(struct inode *parent, (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (!strcmp(dname, n->name->name) || + if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : -- cgit v1.2.3 From 9b019acb72e4b5741d88e8936d6f200ed44b66b2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 Apr 2019 14:26:13 +1000 Subject: sched/nohz: Run NOHZ idle load balancer on HK_FLAG_MISC CPUs The NOHZ idle balancer runs on the lowest idle CPU. This can interfere with isolated CPUs, so confine it to HK_FLAG_MISC housekeeping CPUs. HK_FLAG_SCHED is not used for this because it is not set anywhere at the moment. This could be folded into HK_FLAG_SCHED once that option is fixed. The problem was observed with increased jitter on an application running on CPU0, caused by NOHZ idle load balancing being run on CPU1 (an SMT sibling). Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190412042613.28930-1-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13bafe350abf..7b0da7007da3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9519,22 +9519,26 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * anywhere yet. 
*/ static inline int find_new_ilb(void) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int ilb; - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + housekeeping_cpumask(HK_FLAG_MISC)) { + if (idle_cpu(ilb)) + return ilb; + } return nr_cpu_ids; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { -- cgit v1.2.3 From 948f83768a180ec8e85c4a8ff269d5e433d10815 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:44 +0200 Subject: locking/lockdep: Test all incompatible scenarios at once in check_irq_usage() check_prev_add_irq() tests all incompatible scenarios one after the other while adding a lock (@next) to a tree dependency (@prev): LOCK_USED_IN_HARDIRQ vs LOCK_ENABLED_HARDIRQ LOCK_USED_IN_HARDIRQ_READ vs LOCK_ENABLED_HARDIRQ LOCK_USED_IN_SOFTIRQ vs LOCK_ENABLED_SOFTIRQ LOCK_USED_IN_SOFTIRQ_READ vs LOCK_ENABLED_SOFTIRQ Also for these four scenarios, we must at least iterate the @prev backward dependency. Then if it matches the relevant LOCK_USED_* bit, we must also iterate the @next forward dependency. Therefore in the best case we iterate 4 times, in the worst case 8 times. A different approach can let us divide the number of branch iterations by 4: 1) Iterate through @prev backward dependencies and accumulate all the IRQ uses in a single mask. In the best case where the current lock hasn't been used in IRQ, we stop here. 2) Iterate through @next forward dependencies and try to find a lock whose usage is exclusive to the accumulated usages gathered in the previous step. If we find one (call it @lockA), we have found an incompatible use; otherwise we stop here. Only a bad locking scenario goes further, so a sane verification stops here. 3) Iterate again through the @prev backward dependency and find the lock whose usage matches @lockA in terms of incompatibility. Call that lock @lockB. 4) Report the incompatible usages of @lockA and @lockB. If no incompatible use is found, the verification never goes beyond step 2, which means at most two iterations. The following compares the execution measurements of the function check_prev_add_irq():

             Number of calls | Avg (ns) | Stdev (ns) | Total time (ns)
  ------------------------------------------------------------------------
  Mainline                8452 |     2652 |      11962 |        22415143
  This patch              8452 |     1518 |       7090 |        12835602

Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-5-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 228 ++++++++++++++++++++++++++----------- kernel/locking/lockdep_internals.h | 6 + 2 files changed, 167 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5e149dd78298..25ecc6d3058b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1676,6 +1676,14 @@ check_redundant(struct lock_list *root, struct lock_class *target, } #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ + *(unsigned long *)mask |= entry->class->usage_mask; + + return 0; +} + /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1687,8 +1695,6 @@ static inline int usage_match(struct lock_list *entry, void *mask) return entry->class->usage_mask & *(unsigned long *)mask; } - - /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. @@ -1922,39 +1928,6 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; } -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, lock_flag(bit_backwards), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, lock_flag(bit_forwards), &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); -} - static const char *state_names[] = { #define LOCKDEP_STATE(__STATE) \ __stringify(__STATE), @@ -1977,6 +1950,13 @@ static inline const char *state_name(enum lock_usage_bit bit) return state_names[bit >> LOCK_USAGE_DIR_MASK]; } +/* + * The bit number is encoded like: + * + * bit0: 0 exclusive, 1 read lock + * bit1: 0 used in irq, 1 irq enabled + * bit2-n: state + */ static int exclusive_bit(int new_bit) { int state = new_bit & LOCK_USAGE_STATE_MASK; @@ -1988,45 +1968,160 @@ static int exclusive_bit(int new_bit) return state | (dir ^ LOCK_USAGE_DIR_MASK); } +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. 
+ */ +static unsigned long invert_dir_mask(unsigned long mask) +{ + unsigned long excl = 0; + + /* Invert dir */ + excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; + excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + + return excl; +} + +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. + */ +static unsigned long exclusive_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Strip read */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl &= ~LOCKF_IRQ_READ; + + return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, + unsigned long excl_mask, + enum lock_usage_bit *bitp, + enum lock_usage_bit *excl_bitp) +{ + int bit, excl; + + for_each_set_bit(bit, &mask, LOCK_USED) { + excl = exclusive_bit(bit); + if (excl_mask & lock_flag(excl)) { + *bitp = bit; + *excl_bitp = excl; + return 0; + } + } + return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) + struct held_lock *next) { + unsigned long usage_mask = 0, forward_mask, backward_mask; + enum lock_usage_bit forward_bit = 0, backward_bit = 0; + struct lock_list *uninitialized_var(target_entry1); + struct lock_list *uninitialized_var(target_entry); + struct lock_list this, that; + int ret; + /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : + * Step 1: gather all hard/soft IRQs usages backward in an + * accumulated usage mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + this.parent = NULL; + this.class = hlock_class(prev); - bit++; /* _READ */ + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + if (ret < 0) + return print_bfs_bug(ret); + + usage_mask &= LOCKF_USED_IN_IRQ_ALL; + if (!usage_mask) + return 1; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : + * Step 2: find exclusive uses forward that match the previous + * backward accumulated mask. 
*/ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + forward_mask = exclusive_mask(usage_mask); - return 1; -} + that.parent = NULL; + that.class = hlock_class(next); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE + ret = find_usage_forwards(&that, forward_mask, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; - return 1; + /* + * Step 3: we found a bad match! Now retrieve a lock from the backward + * list whose usage mask matches the exclusive usage mask from the + * lock found on the forward list. + */ + backward_mask = original_mask(target_entry1->class->usage_mask); + + ret = find_usage_backwards(&this, backward_mask, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (DEBUG_LOCKS_WARN_ON(ret == 1)) + return 1; + + /* + * Step 4: narrow down to a pair of incompatible usage bits + * and report it. + */ + ret = find_exclusive_match(target_entry->class->usage_mask, + target_entry1->class->usage_mask, + &backward_bit, &forward_bit); + if (DEBUG_LOCKS_WARN_ON(ret == -1)) + return 1; + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); } static void inc_chains(void) @@ -2043,9 +2138,8 @@ static void inc_chains(void) #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, + struct held_lock *prev, struct held_lock *next) { return 1; } @@ -2225,7 +2319,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, else if (unlikely(ret < 0)) return print_bfs_bug(ret); - if (!check_prev_add_irq(curr, prev, next)) + if (!check_irq_usage(curr, prev, next)) return 0; /* diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 2b3ffd4117ad..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -66,6 +66,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = 0; #undef LOCKDEP_STATE +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) + +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) + /* * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, * .data and .bss to fit in required 32MB limit for the kernel. With -- cgit v1.2.3 From ad282a8117d5048398f506f20b092c14b3b3c43f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:52 -0700 Subject: locking/static_key: Add support for deferred static branches Add deferred static branches. We can't unfortunately use the nice trick of encapsulating the entire structure in true/false variants, because the inside has to be either struct static_key_true or struct static_key_false. Use defines to pass the appropriate members to the helpers separately. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-2-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a799b1ac6b2f..73bbbaddbd9c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -244,12 +244,13 @@ static void __static_key_slow_dec(struct static_key *key, cpus_read_unlock(); } -static void jump_label_update_timeout(struct work_struct *work) +void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key, 0, NULL); } +EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { @@ -264,19 +265,21 @@ void static_key_slow_dec_cpuslocked(struct static_key *key) __static_key_slow_dec_cpuslocked(key, 0, NULL); } -void static_key_slow_dec_deferred(struct static_key_deferred *key) +void __static_key_slow_dec_deferred(struct static_key *key, + struct delayed_work *work, + unsigned long timeout) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(key, timeout, work); } -EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -void static_key_deferred_flush(struct static_key_deferred *key) +void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); - flush_delayed_work(&key->work); + flush_delayed_work(work); } -EXPORT_SYMBOL_GPL(static_key_deferred_flush); +EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) -- cgit v1.2.3 From b92e793bbe4a1c49dbf78d8d526561e7a7dd568a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:53 -0700 Subject: locking/static_key: Factor out the fast path of static_key_slow_dec() static_key_slow_dec() checks whether the atomic enable count is larger than 1 and, if so, decrements it before taking the jump_label_lock. Move this logic into a helper for reuse in rate limited keys. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-3-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 73bbbaddbd9c..02c3d11264dd 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -202,13 +202,13 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec_cpuslocked(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static bool static_key_slow_try_dec(struct static_key *key) { int val; - lockdep_assert_cpus_held(); + val = atomic_fetch_add_unless(&key->enabled, -1, 1); + if (val == 1) + return false; /* * The negative count check is valid even when a negative @@ -217,11 +217,18 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. */ - val = atomic_fetch_add_unless(&key->enabled, -1, 1); - if (val != 1) { - WARN(val < 0, "jump label: negative count!\n"); + WARN(val < 0, "jump label: negative count!\n"); + return true; +} + +static void __static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ + lockdep_assert_cpus_held(); + + if (static_key_slow_try_dec(key)) return; - } jump_label_lock(); if (atomic_dec_and_test(&key->enabled)) { -- cgit v1.2.3 From 94b5f312cfb4a66055d9b688dc9ab6b297eb9dcc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:54 -0700 Subject: locking/static_key: Don't take sleeping locks in __static_key_slow_dec_deferred() Changing jump_label state is protected by jump_label_lock(). Rate limited static_key_slow_dec(), however, will never directly call jump_label_update(), it will schedule a delayed work instead. Therefore it's unnecessary to take both the cpus_read_lock() and jump_label_lock(). This allows static_key_slow_dec_deferred() to be called from atomic contexts, like socket destructing in net/tls, without the need for another indirection. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-4-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 02c3d11264dd..de6efdecc70d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -221,9 +221,7 @@ static bool static_key_slow_try_dec(struct static_key *key) return true; } -static void __static_key_slow_dec_cpuslocked(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); @@ -231,23 +229,15 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, return; jump_label_lock(); - if (atomic_dec_and_test(&key->enabled)) { - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else { - jump_label_update(key); - } - } + if (atomic_dec_and_test(&key->enabled)) + jump_label_update(key); jump_label_unlock(); } -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); - __static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } @@ -255,21 +245,21 @@ void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); - __static_key_slow_dec(&key->key, 0, NULL); + __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(key, 0, NULL); + __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec_cpuslocked(key, 0, NULL); + __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, @@ -277,7 +267,11 @@ void __static_key_slow_dec_deferred(struct static_key *key, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(key, timeout, work); + + if (static_key_slow_try_dec(key)) + return; + + schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -- cgit v1.2.3 From 7a5da02de8d6eafba99556f8c98e5313edebb449 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 18 Apr 2019 16:24:50 +0200 Subject: locking/lockdep: check for freed initmem in static_obj() The following warning occurred on s390: WARNING: CPU: 0 PID: 804 at kernel/locking/lockdep.c:1025 lockdep_register_key+0x30/0x150 This is because the check in static_obj() assumes that all memory within [_stext, _end] belongs to static objects, which at least for s390 isn't true. The init section is also part of this range, and freeing it allows the buddy allocator to allocate memory from it. We have virt == phys for the kernel on s390, so that such allocations would then have addresses within the range [_stext, _end]. To fix this, introduce arch_is_kernel_initmem_freed(), similar to arch_is_kernel_text/data(), and add it to the checks in static_obj(). 
This will always return 0 on architectures that do not define arch_is_kernel_initmem_freed. On s390, it will return 1 if initmem has been freed and the address is in the range [__init_begin, __init_end]. Signed-off-by: Gerald Schaefer Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- kernel/locking/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..22a99530983e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -649,6 +649,9 @@ static int static_obj(const void *obj) end = (unsigned long) &_end, addr = (unsigned long) obj; + if (arch_is_kernel_initmem_freed(addr)) + return 0; + /* * static variable? */ -- cgit v1.2.3 From 3d9a8072915366b5932beeed97f158f8d4955768 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:54 +0200 Subject: tracing: Cleanup stack trace code - Remove the extra array member of stack_dump_trace[] along with the ARRAY_SIZE - 1 initialization for struct stack_trace :: max_entries. Both are historical leftovers of no value. The stack tracer never exceeds the array and there is no extra storage requirement either. - Make variables which are only used in trace_stack.c static. - Simplify the enable/disable logic. - Rename stack_trace_print() as it's using the stack_trace_ namespace. Free the name up for stack trace related functions. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.230654524@linutronix.de --- kernel/trace/trace_stack.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c6e54ff25cae..4efda5f75a0f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,30 +18,26 @@ #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES + 1]; -unsigned stack_trace_index[STACK_TRACE_ENTRIES]; +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -/* - * Reserve one entry for the passed in ip. This will allow - * us to remove most or all of the stack size overhead - * added by the stack tracer itself. 
- */ struct stack_trace stack_trace_max = { - .max_entries = STACK_TRACE_ENTRIES - 1, + .max_entries = STACK_TRACE_ENTRIES, .entries = &stack_dump_trace[0], }; -unsigned long stack_trace_max_size; -arch_spinlock_t stack_trace_max_lock = +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; -static int last_stack_tracer_enabled; -void stack_trace_print(void) +static void print_max_stack(void) { long i; int size; @@ -61,16 +57,7 @@ void stack_trace_print(void) } } -/* - * When arch-specific code overrides this function, the following - * data should be filled up, assuming stack_trace_max_lock is held to - * prevent concurrent updates. - * stack_trace_index[] - * stack_trace_max - * stack_trace_max_size - */ -void __weak -check_stack(unsigned long ip, unsigned long *stack) +static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; @@ -179,7 +166,7 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max.nr_entries = x; if (task_stack_end_corrupted(current)) { - stack_trace_print(); + print_max_stack(); BUG(); } @@ -412,23 +399,21 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int was_enabled; int ret; mutex_lock(&stack_sysctl_mutex); + was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || - (last_stack_tracer_enabled == !!stack_tracer_enabled)) + if (ret || !write || (was_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = !!stack_tracer_enabled; - if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: mutex_unlock(&stack_sysctl_mutex); return ret; @@ -444,7 +429,6 @@ static __init int enable_stacktrace(char *str) strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; - last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); -- cgit v1.2.3 From e9b98e162aa53cbea7c8b0d6c9d5dc6e0f822b9c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:55 +0200 Subject: stacktrace: Provide helpers for common stack trace operations All operations with stack traces are based on struct stack_trace. That's a horrible construct as the struct is a kitchen sink for input and output. Quite some usage sites embed it into their own data structures which creates weird indirections. There is absolutely no point in doing so. For all use cases a storage array and the number of valid stack trace entries in the array is sufficient. Provide helper functions which avoid the struct stack_trace indirection so the usage sites can be cleaned up. 
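To make the new interface concrete, here is a minimal usage sketch; the wrapper function and the array size of 16 are illustrative assumptions, while stack_trace_save() and stack_trace_print() are the helpers this patch introduces:

/*
 * Illustrative sketch only: dump the current stack with the new
 * storage-array based helpers. The function name and the entry
 * count are assumptions, not part of the patch.
 */
static void example_dump_current_stack(void)
{
	unsigned long entries[16];
	unsigned int nr_entries;

	/* Save up to 16 entries, skipping no frames at the top. */
	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

	/* Print them with no extra indentation. */
	stack_trace_print(entries, nr_entries, 0);
}

Compare this with the struct stack_trace kitchen sink, where a caller had to populate entries, max_entries and skip before calling save_stack_trace() and then read nr_entries back out of the struct.
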
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.324810708@linutronix.de --- kernel/stacktrace.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..b38333b3bc18 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -11,35 +11,54 @@ #include #include -void print_stack_trace(struct stack_trace *trace, int spaces) +/** + * stack_trace_print - Print the entries in the stack trace + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + */ +void stack_trace_print(unsigned long *entries, unsigned int nr_entries, + int spaces) { - int i; + unsigned int i; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return; - for (i = 0; i < trace->nr_entries; i++) - printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); + for (i = 0; i < nr_entries; i++) + printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]); +} +EXPORT_SYMBOL_GPL(stack_trace_print); + +void print_stack_trace(struct stack_trace *trace, int spaces) +{ + stack_trace_print(trace->entries, trace->nr_entries, spaces); } EXPORT_SYMBOL_GPL(print_stack_trace); -int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces) +/** + * stack_trace_snprint - Print the entries in the stack trace into a buffer + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. 
+ */ +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, + unsigned int nr_entries, int spaces) { - int i; - int generated; - int total = 0; + unsigned int generated, i, total = 0; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return 0; - for (i = 0; i < trace->nr_entries; i++) { + for (i = 0; i < nr_entries && size; i++) { generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', - (void *)trace->entries[i]); + (void *)entries[i]); total += generated; - - /* Assume that generated isn't a negative number */ if (generated >= size) { buf += size; size = 0; @@ -51,6 +70,14 @@ int snprint_stack_trace(char *buf, size_t size, return total; } +EXPORT_SYMBOL_GPL(stack_trace_snprint); + +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + return stack_trace_snprint(buf, size, trace->entries, + trace->nr_entries, spaces); +} EXPORT_SYMBOL_GPL(snprint_stack_trace); /* @@ -77,3 +104,116 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); return -ENOSYS; } + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace(&trace); + return trace.nr_entries; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr, + }; + + save_stack_trace_regs(regs, &trace); + return trace.nr_entries; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. 
+ */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + int ret = save_stack_trace_tsk_reliable(tsk, &trace); + + return ret ? ret : trace.nr_entries; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + + save_stack_trace_user(&trace); + return trace.nr_entries; +} +#endif /* CONFIG_USER_STACKTRACE_SUPPORT */ -- cgit v1.2.3 From 1b59562d3ab09dcef188eb8055d05f0336380394 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:57 +0200 Subject: backtrace-test: Simplify stack trace handling Replace the indirection through struct stack_trace by using the storage array based interfaces. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.501919093@linutronix.de --- kernel/backtracetest.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 1323360d90e3..a563c8fdad0d 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -48,19 +48,14 @@ static void backtrace_test_irq(void) #ifdef CONFIG_STACKTRACE static void backtrace_test_saved(void) { - struct stack_trace trace; unsigned long entries[8]; + unsigned int nr_entries; pr_info("Testing a saved backtrace.\n"); pr_info("The following trace is a kernel self test and not a bug!\n"); - trace.nr_entries = 0; - trace.max_entries = ARRAY_SIZE(entries); - trace.entries = entries; - trace.skip = 0; - - save_stack_trace(&trace); - print_stack_trace(&trace, 0); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + stack_trace_print(entries, nr_entries, 0); } #else static void backtrace_test_saved(void) -- cgit v1.2.3 From f93877214a83e88373b20801c2d671923d03d07d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:59 +0200 Subject: latency_top: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. 
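The conversion follows the same pattern as the other call sites in this series: the temporary struct stack_trace and its helper disappear in favour of one call into the task variant of the new API. A hedged sketch of the resulting shape, with the record type reduced to its backtrace member for illustration (LT_BACKTRACEDEPTH is the existing depth constant; the struct and function names here are simplified assumptions):

struct example_latency_record {
	unsigned long backtrace[LT_BACKTRACEDEPTH];
};

static void example_store_backtrace(struct task_struct *tsk,
				    struct example_latency_record *lat)
{
	/* One call replaces the memset()/struct stack_trace setup. */
	stack_trace_save_tsk(tsk, lat->backtrace, LT_BACKTRACEDEPTH, 0);
}
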
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.683039030@linutronix.de --- kernel/latencytop.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index f5a90ab3c6b9..99a5b5f46dc5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk, memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, - struct latency_record *lat) -{ - struct stack_trace trace; - - memset(&trace, 0, sizeof(trace)); - trace.max_entries = LT_BACKTRACEDEPTH; - trace.entries = &lat->backtrace[0]; - save_stack_trace_tsk(tsk, &trace); -} - /** * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency @@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.count = 1; lat.time = usecs; lat.max = usecs; - store_stacktrace(tsk, &lat); + + stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0); raw_spin_lock_irqsave(&latency_lock, flags); -- cgit v1.2.3 From 746017ed8d4d3c2070bb03aee9536f24da43c778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:05 +0200 Subject: dma/debug: Simplify stacktrace retrieval Replace the indirection through struct stack_trace with an invocation of the storage array based interface. 
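As with the other conversions, the per-entry struct stack_trace plus separate storage array collapse into the array and a length. A sketch of the embedded-storage pattern used below, assuming a structure reduced to the two relevant members (the real dma_debug_entry carries many more fields; DMA_DEBUG_STACKTRACE_ENTRIES is the existing constant):

struct example_debug_entry {
	unsigned int  stack_len;
	unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
};

static void example_record_alloc_site(struct example_debug_entry *entry)
{
	/* Skip one frame so the mapping site, not this helper, leads. */
	entry->stack_len = stack_trace_save(entry->stack_entries,
					    ARRAY_SIZE(entry->stack_entries),
					    1);
}

Printing later only needs the pair back: stack_trace_print(entry->stack_entries, entry->stack_len, 0).
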
Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.248658135@linutronix.de --- kernel/dma/debug.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a218e43cc382..badd77670d00 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -89,8 +89,8 @@ struct dma_debug_entry { int sg_mapped_ents; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE - struct stack_trace stacktrace; - unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; + unsigned int stack_len; + unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif }; @@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) #ifdef CONFIG_STACKTRACE if (entry) { pr_warning("Mapped at:\n"); - print_stack_trace(&entry->stacktrace, 0); + stack_trace_print(entry->stack_entries, entry->stack_len, 0); } #endif } @@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_unlock_irqrestore(&free_entries_lock, flags); #ifdef CONFIG_STACKTRACE - entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; - entry->stacktrace.entries = entry->st_entries; - entry->stacktrace.skip = 1; - save_stack_trace(&entry->stacktrace); + entry->stack_len = stack_trace_save(entry->stack_entries, + ARRAY_SIZE(entry->stack_entries), + 1); #endif - return entry; } -- cgit v1.2.3 From b1abe4622d4cc32b3b37cfefbc7ac070a8f868e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:10 +0200 Subject: lockdep: Remove unused trace argument from print_circular_bug() Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.716274532@linutronix.de --- kernel/locking/lockdep.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git 
a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2edf9501d906..d7615d299d08 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1522,10 +1522,9 @@ static inline int class_equal(struct lock_list *entry, void *data) } static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt, - struct stack_trace *trace) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -2206,7 +2205,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ save(trace); } - return print_circular_bug(&this, target_entry, next, prev, trace); + return print_circular_bug(&this, target_entry, next, prev); } else if (unlikely(ret < 0)) return print_bfs_bug(ret); -- cgit v1.2.3 From 76b14436b4d98903fef723365170bedd6f28ab2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:11 +0200 Subject: lockdep: Remove save argument from check_prev_add() There is only one caller which hands in save_trace as function pointer. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.803362058@linutronix.de --- kernel/locking/lockdep.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d7615d299d08..3603893d5bbd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2158,8 +2158,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace, - int (*save)(struct stack_trace *trace)) + struct held_lock *next, int distance, struct stack_trace *trace) { struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; @@ -2199,11 +2198,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (unlikely(!ret)) { if (!trace->entries) { /* - * If @save fails here, the printing might trigger - * a WARN but because of the !nr_entries it should - * not do bad things. + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. 
*/ - save(trace); + save_trace(trace); } return print_circular_bug(&this, target_entry, next, prev); } @@ -2253,7 +2252,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!trace->entries && !save(trace)) + if (!trace->entries && !save_trace(trace)) return 0; /* @@ -2318,7 +2317,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + int ret = check_prev_add(curr, hlock, next, distance, + &trace); if (!ret) return 0; -- cgit v1.2.3 From c120bce78065cbea460a58b1572c215db9c148ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:12 +0200 Subject: lockdep: Simplify stack trace handling Replace the indirection through struct stack_trace by using the storage array based interfaces and storing the information in a small lockdep specific data structure. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.891724020@linutronix.de --- kernel/locking/lockdep.c | 55 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3603893d5bbd..45bcaf2e4cb6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -434,18 +434,14 @@ static void print_lockdep_off(const char *bug_msg) #endif } -static int save_trace(struct stack_trace *trace) +static int save_trace(struct lock_trace *trace) { - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - trace->max_entries = trace->nr_entries; + unsigned long *entries = stack_trace + nr_stack_trace_entries; + unsigned int max_entries; + trace->offset = nr_stack_trace_entries; + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->nr_entries = stack_trace_save(entries, max_entries, 3); nr_stack_trace_entries += trace->nr_entries; if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -1196,7 +1192,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct stack_trace *trace) + struct lock_trace *trace) { struct lock_list *entry; /* @@ -1415,6 +1411,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry, * checking. 
*/ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ + unsigned long *entries = stack_trace + trace->offset; + + stack_trace_print(entries, trace->nr_entries, spaces); +} + /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): @@ -1427,8 +1430,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_stack_trace(&target->trace, 6); - + print_lock_trace(&target->trace, 6); return 0; } @@ -1740,7 +1742,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_stack_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); @@ -1765,7 +1767,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); + print_lock_trace(&entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1878,14 +1880,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2158,7 +2160,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace) + struct held_lock *next, int distance, struct lock_trace *trace) { struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; @@ -2196,7 +2198,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) { - if (!trace->entries) { + if (!trace->nr_entries) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it @@ -2252,7 +2254,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!trace->entries && !save_trace(trace)) + if (!trace->nr_entries && !save_trace(trace)) return 0; /* @@ -2284,14 +2286,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { + struct lock_trace trace = { .nr_entries = 0 }; int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace = { - .nr_entries = 0, - .max_entries = 0, - .entries = NULL, - .skip = 0, - }; /* * Debugging checks. 
@@ -2719,6 +2716,10 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +} #endif /* @@ -2815,7 +2816,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); -- cgit v1.2.3 From e7d916632b528e8cccc8e9ccca81acfc591a5fde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:13 +0200 Subject: tracing: Simplify stacktrace retrieval in histograms The indirection through struct stack_trace is not necessary at all. Use the storage array based interface. Signed-off-by: Thomas Gleixner Tested-by: Tom Zanussi Reviewed-by: Tom Zanussi Reviewed-by: Josh Poimboeuf Acked-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.979089273@linutronix.de --- kernel/trace/trace_events_hist.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21ceae299f7e..a1d20421f4b0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5186,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; - struct stack_trace stacktrace; struct hist_field *key_field; u64 field_contents; void *key = NULL; @@ -5198,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - stacktrace.max_entries = HIST_STACKTRACE_DEPTH; - stacktrace.entries = entries; - stacktrace.nr_entries = 0; - stacktrace.skip = HIST_STACKTRACE_SKIP; - - memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); - save_stack_trace(&stacktrace); - + memset(entries, 0, HIST_STACKTRACE_SIZE); + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); key = entries; } else { field_contents = key_field->fn(key_field, elt, rbe, rec); -- cgit v1.2.3 From 2a820bf74918d61ea54f7c1001f4a6a2e457577c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:14 +0200 Subject: tracing: Use percpu stack trace buffer more intelligently The per cpu stack trace buffer usage pattern is odd at best. The buffer has place for 512 stack trace entries on 64-bit and 1024 on 32-bit. 
When interrupts or exceptions nest after the per cpu buffer was acquired, the stacktrace length is hardcoded to 8 entries. 512/1024 stack trace entries in kernel stacks are unrealistic so the buffer is a complete waste. Split the buffer into 4 nest levels, which are 128/256 entries per level. This allows nesting contexts (interrupts, exceptions) to utilize the cpu buffer for stack retrieval and avoids the fixed length allocation along with the conditional execution paths. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.066064076@linutronix.de --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..4fc93004feab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2749,12 +2749,21 @@ trace_function(struct trace_array *tr, #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING 4 + +#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) + struct ftrace_stack { - unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; + unsigned long calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { + struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -2763,10 +2772,11 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + struct ftrace_stack *fstack; struct stack_entry *entry; struct stack_trace trace; - int use_stack; - int size = FTRACE_STACK_ENTRIES; + int size = FTRACE_KSTACK_ENTRIES; + int stackidx; trace.nr_entries = 0; trace.skip = skip; @@ -2788,29 +2798,32 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = __this_cpu_inc_return(ftrace_stack_reserve); + stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + + /* This should never happen. If it does, yell once and skip */ + if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + goto out; + /* - * We don't need any atomic variables, just a barrier. - * If an interrupt comes in, we don't care, because it would - * have exited and put the counter back to what we want. - * We just need a barrier to keep gcc from moving things - * around. 
+ * The above __this_cpu_inc_return() is 'atomic' cpu local. An + * interrupt will either see the value pre increment or post + * increment. If the interrupt happens pre increment it will have + * restored the counter when it returns. We just need a barrier to + * keep gcc from moving things around. */ barrier(); - if (use_stack == 1) { - trace.entries = this_cpu_ptr(ftrace_stack.calls); - trace.max_entries = FTRACE_STACK_MAX_ENTRIES; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); + fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; + trace.entries = fstack->calls; + trace.max_entries = FTRACE_KSTACK_ENTRIES; - if (trace.nr_entries > size) - size = trace.nr_entries; - } else - /* From now on, use_stack is a boolean */ - use_stack = 0; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; size *= sizeof(unsigned long); @@ -2820,19 +2833,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, goto out; entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, size); - - if (use_stack) - memcpy(&entry->caller, trace.entries, - trace.nr_entries * sizeof(unsigned long)); - else { - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.entries = entry->caller; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - } + memcpy(&entry->caller, trace.entries, size); entry->size = trace.nr_entries; -- cgit v1.2.3 From c438f140cc16d47fac808d893f5017f6d641cb46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:15 +0200 Subject: tracing: Make ftrace_trace_userstack() static and conditional It's only used in trace.c and there is absolutely no point in compiling it in when user space stack traces are not supported. 
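The mechanics are the usual configuration-stub pattern: compile the real implementation only under CONFIG_USER_STACKTRACE_SUPPORT and provide an empty static stub otherwise, so the call sites in trace.c stay free of #ifdefs. A skeleton of the shape, with the real bodies elided (they are in the diff below):

#ifdef CONFIG_USER_STACKTRACE_SUPPORT
static void ftrace_trace_userstack(struct ring_buffer *buffer,
				   unsigned long flags, int pc)
{
	/* ... record the user space stack into the ring buffer ... */
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
static void ftrace_trace_userstack(struct ring_buffer *buffer,
				   unsigned long flags, int pc)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
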
Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.162400595@linutronix.de --- kernel/trace/trace.c | 14 ++++++++------ kernel/trace/trace.h | 8 -------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4fc93004feab..d8369d27c1af 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -2905,9 +2907,10 @@ void trace_dump_stack(int skip) } EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); -void +static void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; @@ -2958,13 +2961,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) out: preempt_enable(); } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) { - ftrace_trace_userstack(tr, flags, preempt_count()); } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..639047b259d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, - int pc); - void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_userstack(struct ring_buffer *buffer, - unsigned long flags, int pc) -{ -} - static inline void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { -- cgit v1.2.3 From ee6dd0db4d8de41a0a0bc37d8d87a0b1623f83b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:16 +0200 Subject: tracing: Simplify stack trace retrieval Replace the indirection through struct stack_trace by using the storage array based interfaces. 
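After the conversion, the kernel-stack path boils down to choosing between the regs-based and the plain save, then sizing the ring buffer event from the returned count. A condensed sketch (variable names follow the patch; the wrapper function itself is illustrative, not part of the change):

/*
 * Illustrative condensation of the new call pattern in
 * __ftrace_trace_stack(): save into the per-cpu storage array
 * and report how many entries were captured.
 */
static unsigned int example_save_kernel_stack(struct pt_regs *regs,
					      unsigned long *calls,
					      unsigned int size, int skip)
{
	if (regs)
		return stack_trace_save_regs(regs, calls, size, skip);

	return stack_trace_save(calls, size, skip);
}

The event payload is then nr_entries * sizeof(unsigned long) bytes, copied straight from the storage array.
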
Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.248604594@linutronix.de --- kernel/trace/trace.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d8369d27c1af..0ce8515dd470 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2774,22 +2774,18 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; - struct stack_trace trace; - int size = FTRACE_KSTACK_ENTRIES; int stackidx; - trace.nr_entries = 0; - trace.skip = skip; - /* * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ #ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip++; + skip++; #endif /* @@ -2816,28 +2812,24 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, barrier(); fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; - trace.entries = fstack->calls; - trace.max_entries = FTRACE_KSTACK_ENTRIES; - - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - - if (trace.nr_entries > size) - size = trace.nr_entries; + size = ARRAY_SIZE(fstack->calls); - size *= sizeof(unsigned long); + if (regs) { + nr_entries = stack_trace_save_regs(regs, fstack->calls, + size, skip); + } else { + nr_entries = stack_trace_save(fstack->calls, size, skip); + } + size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); - memcpy(&entry->caller, trace.entries, size); - - entry->size = trace.nr_entries; + memcpy(&entry->caller, fstack->calls, size); + entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -2916,7 +2908,6 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - struct stack_trace trace; if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; @@ -2947,12 +2938,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = 0; - trace.entries = entry->caller; - - 
save_stack_trace_user(&trace); + stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); -- cgit v1.2.3 From 9f50c91b1195dfffd183d5d8505e45af86623532 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:17 +0200 Subject: tracing: Remove the last struct stack_trace usage Simplify the stack retrieval code by using the storage array based interface. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.340000461@linutronix.de --- kernel/trace/trace_stack.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4efda5f75a0f..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -23,11 +23,7 @@ static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -struct stack_trace stack_trace_max = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = &stack_dump_trace[0], -}; - +static unsigned int stack_trace_nr_entries; static unsigned long stack_trace_max_size; static arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -44,10 +40,10 @@ static void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); - for (i = 0; i < stack_trace_max.nr_entries; i++) { - if (i + 1 == stack_trace_max.nr_entries) + for (i = 0; i < stack_trace_nr_entries; i++) { + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -93,13 +89,12 @@ static void check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; - stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 0; - - save_stack_trace(&stack_trace_max); + stack_trace_nr_entries = stack_trace_save(stack_dump_trace, + ARRAY_SIZE(stack_dump_trace) - 1, + 0); /* Skip over the overhead of the stack tracer itself */ - for (i = 0; i < stack_trace_max.nr_entries; i++) { + for (i = 0; i < stack_trace_nr_entries; i++) { if (stack_dump_trace[i] == ip) break; } @@ -108,7 +103,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) * Some archs may not have the passed in ip in the dump. * If that happens, we need to show everything. 
*/ - if (i == stack_trace_max.nr_entries) + if (i == stack_trace_nr_entries) i = 0; /* @@ -126,13 +121,13 @@ static void check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < stack_trace_max.nr_entries) { + while (i < stack_trace_nr_entries) { int found = 0; stack_trace_index[x] = this_size; p = start; - for (; p < top && i < stack_trace_max.nr_entries; p++) { + for (; p < top && i < stack_trace_nr_entries; p++) { /* * The READ_ONCE_NOCHECK is used to let KASAN know that * this is not a stack-out-of-bounds error. @@ -163,7 +158,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) i++; } - stack_trace_max.nr_entries = x; + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { print_max_stack(); @@ -265,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= stack_trace_max.nr_entries) + if (n >= stack_trace_nr_entries) return NULL; m->private = (void *)n; @@ -329,7 +324,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); @@ -339,10 +334,10 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= stack_trace_max.nr_entries) + if (i >= stack_trace_nr_entries) return 0; - if (i + 1 == stack_trace_max.nr_entries) + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; -- cgit v1.2.3 From 25e39e32b0a3f99b9db320605f20f91d425b6a65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:18 +0200 Subject: livepatch: Simplify stack trace retrieval Replace the indirection through struct stack_trace by using the storage array based interfaces. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Miroslav Benes Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.437950229@linutronix.de --- kernel/livepatch/transition.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 9c89ae8b337a..c53370d596be 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task) * Determine whether the given stack trace includes any references to a * to-be-patched or to-be-unpatched function. 
*/ -static int klp_check_stack_func(struct klp_func *func, - struct stack_trace *trace) +static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, + unsigned int nr_entries) { unsigned long func_addr, func_size, address; struct klp_ops *ops; int i; - for (i = 0; i < trace->nr_entries; i++) { - address = trace->entries[i]; + for (i = 0; i < nr_entries; i++) { + address = entries[i]; if (klp_target_state == KLP_UNPATCHED) { /* @@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func, static int klp_check_stack(struct task_struct *task, char *err_buf) { static unsigned long entries[MAX_STACK_ENTRIES]; - struct stack_trace trace; struct klp_object *obj; struct klp_func *func; - int ret; + int ret, nr_entries; - trace.skip = 0; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_ENTRIES; - trace.entries = entries; - ret = save_stack_trace_tsk_reliable(task, &trace); + ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); WARN_ON_ONCE(ret == -ENOSYS); - if (ret) { + if (ret < 0) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d has an unreliable stack\n", __func__, task->comm, task->pid); return ret; } + nr_entries = ret; klp_for_each_object(klp_transition_patch, obj) { if (!obj->patched) continue; klp_for_each_func(obj, func) { - ret = klp_check_stack_func(func, &trace); + ret = klp_check_stack_func(func, entries, nr_entries); if (ret) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d is sleeping on function %s\n", -- cgit v1.2.3 From 988ec8841ca1e22b2978fce0134d8267e838770e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:19 +0200 Subject: stacktrace: Remove obsolete functions No more users of the struct stack_trace based interfaces. Remove them. Remove the macro stubs for !CONFIG_STACKTRACE as well; they are pointless because the storage on the call sites is already conditional on CONFIG_STACKTRACE. No point in being 'smart'.
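For callers of the removed helpers, the migration is mechanical; a hedged sketch, where entries and nr_entries are assumed to come from a prior stack_trace_save():

    /* Before: print_stack_trace(&trace, 4); */
    stack_trace_print(entries, nr_entries, 4);

    /* Before: ret = snprint_stack_trace(buf, size, &trace, 4); */
    ret = stack_trace_snprint(buf, size, entries, nr_entries, 4);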
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.524796783@linutronix.de --- kernel/stacktrace.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index b38333b3bc18..dd55312f3fe9 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -30,12 +30,6 @@ void stack_trace_print(unsigned long *entries, unsigned int nr_entries, } EXPORT_SYMBOL_GPL(stack_trace_print); -void print_stack_trace(struct stack_trace *trace, int spaces) -{ - stack_trace_print(trace->entries, trace->nr_entries, spaces); -} -EXPORT_SYMBOL_GPL(print_stack_trace); - /** * stack_trace_snprint - Print the entries in the stack trace into a buffer * @buf: Pointer to the print buffer @@ -72,14 +66,6 @@ int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_trace_snprint); -int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces) -{ - return stack_trace_snprint(buf, size, trace->entries, - trace->nr_entries, spaces); -} -EXPORT_SYMBOL_GPL(snprint_stack_trace); - /* * Architectures that do not implement save_stack_trace_*() * get these weak aliases and once-per-bootup warnings -- cgit v1.2.3 From 214d8ca6ee854f696f75e75511fe66b409e656db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:21 +0200 Subject: stacktrace: Provide common infrastructure All architectures which support stacktrace carry duplicated code and do the stack storage and filtering on the architecture side. Provide a consolidated interface with a callback function for consuming the stack entries provided by the architecture specific stack walker. This removes lots of duplicated code and allows implementing better filtering than 'skip number of entries' in the future without touching any architecture specific code.
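To make the callback contract concrete, a hedged sketch of the architecture side; the unwind_* helpers and the state structure are stand-ins for whatever unwinder a given architecture actually provides:

    /* Hypothetical arch implementation feeding the common consumer. */
    void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
                         struct task_struct *task, struct pt_regs *regs)
    {
            struct unwind_state state;

            for (unwind_start(&state, task, regs); !unwind_done(&state);
                 unwind_next_frame(&state)) {
                    /* Stop once the consumer says its storage is full. */
                    if (!consume_entry(cookie, unwind_get_return_address(&state), false))
                            break;
            }
    }

The common code then layers skipping, storage, and filtering on top via the cookie, as the kernel/stacktrace.c hunk that follows shows.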
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: linux-arch@vger.kernel.org Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Link: https://lkml.kernel.org/r/20190425094803.713568606@linutronix.de --- kernel/stacktrace.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index dd55312f3fe9..27bafc1e271e 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -5,6 +5,8 @@ * * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ +#include +#include #include #include #include @@ -66,6 +68,175 @@ int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_trace_snprint); +#ifdef CONFIG_ARCH_STACKWALK + +struct stacktrace_cookie { + unsigned long *store; + unsigned int size; + unsigned int skip; + unsigned int len; +}; + +static bool stack_trace_consume_entry(void *cookie, unsigned long addr, + bool reliable) +{ + struct stacktrace_cookie *c = cookie; + + if (c->len >= c->size) + return false; + + if (c->skip > 0) { + c->skip--; + return true; + } + c->store[c->len++] = addr; + return c->len < c->size; +} + +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, + bool reliable) +{ + if (in_sched_functions(addr)) + return true; + return stack_trace_consume_entry(cookie, addr, reliable); +} + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + arch_stack_walk(consume_entry, &c, current, NULL); + return c.len; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + if (!try_get_task_stack(tsk)) + return 0; + + arch_stack_walk(consume_entry, &c, tsk, NULL); + put_task_stack(tsk); + return c.len; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr, + }; + + arch_stack_walk(consume_entry, &c, current, regs); + return c.len; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + int ret; + + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ + if (!try_get_task_stack(tsk)) + return 0; + + ret = arch_stack_walk_reliable(consume_entry, &c, tsk); + put_task_stack(tsk); + return ret; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + + /* Trace user stack if not a kernel thread */ + if (!current->mm) + return 0; + + arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + return c.len; +} +#endif + +#else /* CONFIG_ARCH_STACKWALK */ + /* * Architectures that do not implement save_stack_trace_*() * get these weak aliases and once-per-bootup warnings @@ -203,3 +374,5 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) return trace.nr_entries; } #endif /* CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* !CONFIG_ARCH_STACKWALK */ -- cgit v1.2.3 From 31adf2308f33dcae59009019675224be0978bc70 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 24 Apr 2019 10:55:48 +0200 Subject: livepatch: Convert error about unsupported reliable stacktrace into a warning The commit d0807da78e11d46f ("livepatch: Remove immediate feature") caused any livepatch to be refused when reliable stacktraces were not supported on the given architecture. The limitation is too strong. User space processes are safely migrated even when entering or leaving the kernel. Kthread transitions would need to be forced. But it is safe when: + The livepatch does not change the semantics of the code. + Callbacks do not depend on a safely finished transition. Suggested-by: Josh Poimboeuf Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..14f33ab6c583 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1003,11 +1003,10 @@ int klp_enable_patch(struct klp_patch *patch) return -ENODEV; if (!klp_have_reliable_stack()) { - pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -EOPNOTSUPP; + pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); + pr_warn("The livepatch transition may never complete.\n"); } - mutex_lock(&klp_mutex); ret = klp_init_patch_early(patch); -- cgit v1.2.3 From d671002be6bdd7f77a771e23bf3e95d1f16775e6 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 29 Apr 2019 20:26:31 +0800 Subject: locking/lockdep: Remove unnecessary unlikely() DEBUG_LOCKS_WARN_ON() already contains an unlikely(); there is no need for another one. Signed-off-by: zhengbin Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: houtao1@huawei.com Link: http://lkml.kernel.org/r/1556540791-23110-1-git-send-email-zhengbin13@huawei.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 25ecc6d3058b..6426d071a324 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3256,7 +3256,7 @@ void lockdep_hardirqs_on(unsigned long ip) /* * See the fine text that goes along with this variable definition.
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) return; /* -- cgit v1.2.3 From 08970ecf744e09837bb6620c95406710f4c81ae2 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 18 Apr 2019 16:54:01 +0100 Subject: irq/irqdomain: Fix typo in the comment on top of __irq_domain_alloc_irqs() The word 'number' has been misspelt in the comment on top of __irq_domain_alloc_irqs(). Signed-off-by: Julien Grall Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9ed29e4a7dbf..a453e229f99c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from - * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument -- cgit v1.2.3 From 43d8ce9d65a54846d378545770991e65838981e0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 26 Apr 2019 15:04:29 -0400 Subject: Provide in-kernel headers to make extending kernel easier Introduce in-kernel headers which are made available as an archive through proc (/proc/kheaders.tar.xz file). This archive makes it possible to run eBPF and other tracing programs that need to extend the kernel for tracing purposes without any dependency on the file system having headers. A github PR is sent for the corresponding BCC patch at: https://github.com/iovisor/bcc/pull/2312 On Android and embedded systems, it is common to switch kernels but not have kernel headers available on the file system. Further, once a different kernel is booted, any headers stored on the file system will no longer be useful. This issue is well known even to distros. By storing the headers as a compressed archive within the kernel, we can avoid these issues that have been a hindrance for a long time. The best way to use this feature is by building it in. Several users have a need for this: when they switch debug kernels, they do not want to update the filesystem or worry about where to store the headers on it. However, the feature is also buildable as a module in case the user prefers that it not be part of the kernel image. This makes it possible to load and unload the headers from memory on demand. A tracing program can load the module, do its operations, and then unload the module to save kernel memory. The total memory needed is 3.3MB. By having the archive available at a fixed location independent of filesystem dependencies and conventions, all debugging tools can refer directly to the fixed location for the archive, without being concerned with where the headers live on a typical filesystem; this significantly simplifies tooling that needs kernel headers. The code to read the headers is based on the /proc/config.gz code and uses the same technique to embed the headers. Other approaches were discussed, such as having an in-memory mountable filesystem, but that has drawbacks such as requiring an in-kernel xz decompressor, which we don't have today, and requiring 42 MB of kernel memory to host the decompressed headers at any time. This approach is also simpler than those alternatives.
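For a sense of how tooling consumes the archive, a small hedged userspace sketch that copies it out of proc for later extraction; error handling is minimal and the output path is arbitrary:

    #include <stdio.h>

    int main(void)
    {
            char buf[4096];
            size_t n;
            FILE *in = fopen("/proc/kheaders.tar.xz", "rb");
            FILE *out = fopen("kheaders.tar.xz", "wb");

            if (!in || !out)
                    return 1;
            while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                    fwrite(buf, 1, n, out);
            fclose(in);
            fclose(out);
            return 0;       /* then: tar -xJf kheaders.tar.xz -C <dest> */
    }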
Reviewed-by: Masahiro Yamada Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/.gitignore | 1 + kernel/Makefile | 10 ++++++ kernel/gen_ikh_data.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 74 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 174 insertions(+) create mode 100755 kernel/gen_ikh_data.sh create mode 100644 kernel/kheaders.c (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index 6e699100872f..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,5 +1,6 @@ # # Generated files # +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..12399614c350 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -121,3 +122,12 @@ $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) + +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz + +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. 
+pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include +#include +#include +#include +#include + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. 
+ */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + return simple_read_from_buffer(buf, len, offset, + &kernel_headers_data, + &kernel_headers_data_end - + &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { + .read = ikheaders_read_current, + .llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current headers file */ + entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, + &ikheaders_file_ops); + if (!entry) + return -ENOMEM; + + proc_set_size(entry, + &kernel_headers_data_end - + &kernel_headers_data); + return 0; +} + +static void __exit ikheaders_cleanup(void) +{ + remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); -- cgit v1.2.3 From c7b6f29b6257532792fc722b68fcc0e00b5a856c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:43 -0700 Subject: bpf: Fail bpf_probe_write_user() while mm is switched When using a temporary mm, bpf_probe_write_user() should not be able to write to user memory, since user memory addresses may be used to map kernel memory. Detect these cases and fail bpf_probe_write_user() in such cases. Suggested-by: Jann Horn Reported-by: Jann Horn Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Borkmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-24-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/trace/bpf_trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,8 @@ #include #include +#include + #include "trace_probe.h" #include "trace.h" @@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. + * + * nmi_uaccess_okay() ensures the probe is not run in an interim + * state, when the task or mm are switched. This is specifically + * required to prevent the use of temporary mm. 
*/ if (unlikely(in_interrupt() || @@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; if (!access_ok(unsafe_ptr, size)) return -EPERM; -- cgit v1.2.3 From aad42dd44db086c79ca3f470ad563d2ac4ac218d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 26 Apr 2019 16:22:44 -0700 Subject: uprobes: Initialize uprobes earlier In order to have a separate address space for text poking, we need to duplicate init_mm early during start_kernel(). This, however, introduces a problem since uprobes functions are called from dup_mmap(), but uprobes is still not initialized at this early stage. Since uprobes initialization is necessary for fork, and since all the dependent initialization has been done when fork is initialized (percpu and vmalloc), move uprobes initialization to fork_init(). Uprobes does not appear to introduce any security problem for the poking_mm. Crash and burn if uprobes initialization fails, similarly to other early initializations. Rename init_uprobes() to uprobes_init() to match the naming convention of other early initialization functions. Reported-by: kernel test robot Signed-off-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rick Edgecombe Cc: Rik van Riel Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: ard.biesheuvel@linaro.org Cc: deneen.t.dock@intel.com Cc: kernel-hardening@lists.openwall.com Cc: kristen@linux.intel.com Cc: linux_dti@icloud.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190426232303.28381-6-nadav.amit@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 +++----- kernel/fork.c | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..e6a0d6be87e3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = { .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; -static int __init init_uprobes(void) +void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - if (percpu_init_rwsem(&dup_mmap_sem)) - return -ENOMEM; + BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - return register_die_notifier(&uprobe_exception_nb); + BUG_ON(register_die_notifier(&uprobe_exception_nb)); } -__initcall(init_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..44fba5e5e916 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -815,6 +815,7 @@ void __init fork_init(void) #endif lockdep_init_task(&init_task); + uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, -- cgit v1.2.3 From 13585fa0668c724efab9635aaeef6ec390217415 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:25 -0700 Subject: fork: Provide a function for copying init_mm Provide a function for copying init_mm. This function will later be used for setting a temporary mm. Tested-by: Masami Hiramatsu Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H.
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-6-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44fba5e5e916..fbe9dfcd8680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1299,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. +/** + * dup_mm() - duplicates an existing mm structure + * @tsk: the task_struct with which the new mm will be associated. + * @oldmm: the mm to duplicate. + * + * Allocates a new mm structure and duplicates the provided @oldmm structure + * content into it. + * + * Return: the duplicated mm or NULL on failure. */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk, + struct mm_struct *oldmm) { - struct mm_struct *mm, *oldmm = current->mm; + struct mm_struct *mm; int err; mm = allocate_mm(); @@ -1372,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) } retval = -ENOMEM; - mm = dup_mm(tsk); + mm = dup_mm(tsk, current->mm); if (!mm) goto fail_nomem; @@ -2187,6 +2194,11 @@ struct task_struct *fork_idle(int cpu) return task; } +struct mm_struct *copy_init_mm(void) +{ + return dup_mm(NULL, &init_mm); +} + /* * Ok, this is the main fork-routine. * -- cgit v1.2.3 From f2c65fb3221adc6b73b0549fc7ba892022db9797 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:31 -0700 Subject: x86/modules: Avoid breaking W^X while loading modules When modules and BPF filters are loaded, there is a time window in which some memory is both writable and executable. An attacker that has already found another vulnerability (e.g., a dangling pointer) might be able to exploit this behavior to overwrite kernel code. Prevent having writable executable PTEs in this stage. In addition, avoiding having W+X mappings can also slightly simplify the patching of modules code on initialization (e.g., by alternatives and static-key), as would be done in the next patch. This was actually the main motivation for this patch. To avoid having W+X mappings, set them initially as RW (NX) and after they are set as RO set them as X as well. Setting them as executable is done as a separate step to avoid one core in which the old PTE is cached (hence writable), and another which sees the updated PTE (executable), which would break the W^X protection. Suggested-by: Thomas Gleixner Suggested-by: Andy Lutomirski Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Jessica Yu Cc: Kees Cook Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Rik van Riel Link: https://lkml.kernel.org/r/20190426001143.4983-12-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..2b2845ae983e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1950,8 +1950,13 @@ void module_enable_ro(const struct module *mod, bool after_init) return; frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->core_layout, set_memory_x); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_ro); if (after_init) -- cgit v1.2.3 From d63326928611600ad65baff54a70f53b02b3cdfe Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:35 -0700 Subject: mm/hibernation: Make hibernation handle unmapped pages Make hibernate handle unmapped pages on the direct map when CONFIG_ARCH_HAS_SET_ALIAS=y is set. These functions allow for setting pages to invalid configurations, so hibernate should now check whether the pages have valid mappings and handle them if they are unmapped when doing a hibernate save operation. Previously this checking was already done when CONFIG_DEBUG_PAGEALLOC=y was configured. It does not appear to have a big impact on hibernation performance. The speed of the saving operation before this change was measured as 819.02 MB/s, and after was measured at 813.32 MB/s. Before: [ 4.670938] PM: Wrote 171996 kbytes in 0.21 seconds (819.02 MB/s) After: [ 4.504714] PM: Wrote 178932 kbytes in 0.22 seconds (813.32 MB/s) Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Acked-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-16-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/power/snapshot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) * safe_copy_page - Copy a page in a safe way. * * Check if the page we are going to copy is marked as present in the kernel - * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set - * and in that case kernel_page_present() always returns 'true'). + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'. */ static void safe_copy_page(void *dst, struct page *s_page) { -- cgit v1.2.3 From 1a7b7d9220819afe79d1ec5d759fe4349bd2453e Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:37 -0700 Subject: modules: Use vmalloc special flag Use the new flag for handling freeing of special permissioned memory in vmalloc, and remove the places where memory was set RW before freeing, which is no longer needed. Since freeing of VM_FLUSH_RESET_PERMS memory is not supported in an interrupt by vmalloc, the freeing of init sections is moved to a work queue. Instead of call_rcu it now uses synchronize_rcu() in the work queue.
Lastly, there is now a WARN_ON in module_memfree since it should not be called in an interrupt with special memory as is required for VM_FLUSH_RESET_PERMS. Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Jessica Yu Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-18-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/module.c | 77 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2b2845ae983e..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +/* Work queue for freeing init sections in success case */ +static struct work_struct init_free_wq; +static struct llist_head init_free_list; + #ifdef CONFIG_MODULES_TREE_LOOKUP /* @@ -1949,6 +1953,8 @@ void module_enable_ro(const struct module *mod, bool after_init) if (!rodata_enabled) return; + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); frob_text(&mod->core_layout, set_memory_ro); frob_text(&mod->core_layout, set_memory_x); @@ -1972,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -static void module_disable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_x); - frob_ro_after_init(&mod->core_layout, set_memory_x); - frob_writable_data(&mod->core_layout, set_memory_x); - frob_rodata(&mod->init_layout, set_memory_x); - frob_writable_data(&mod->init_layout, set_memory_x); -} - /* Iterate through all modules and set each module's text as RW */ void set_all_modules_text_rw(void) { @@ -2024,23 +2021,8 @@ void set_all_modules_text_ro(void) } mutex_unlock(&module_mutex); } - -static void disable_ro_nx(const struct module_layout *layout) -{ - if (rodata_enabled) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); - frob_ro_after_init(layout, set_memory_rw); - } - frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_x); - frob_writable_data(layout, set_memory_x); -} - #else -static void disable_ro_nx(const struct module_layout *layout) { } static void module_enable_nx(const struct module *mod) { } -static void module_disable_nx(const struct module *mod) { } #endif #ifdef CONFIG_LIVEPATCH @@ -2120,6 +2102,11 @@ static void free_module_elf(struct module *mod) void __weak module_memfree(void *module_region) { + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. 
+ */ + WARN_ON(in_interrupt()); vfree(module_region); } @@ -2171,7 +2158,6 @@ static void free_module(struct module *mod) mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); kfree(mod->args); @@ -2181,7 +2167,6 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); } @@ -3420,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) /* For freeing module_init on success, in case kallsyms traversing */ struct mod_initfree { - struct rcu_head rcu; + struct llist_node node; void *module_init; }; -static void do_free_init(struct rcu_head *head) +static void do_free_init(struct work_struct *w) { - struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); - module_memfree(m->module_init); - kfree(m); + struct llist_node *pos, *n, *list; + struct mod_initfree *initfree; + + list = llist_del_all(&init_free_list); + + synchronize_rcu(); + + llist_for_each_safe(pos, n, list) { + initfree = container_of(pos, struct mod_initfree, node); + module_memfree(initfree->module_init); + kfree(initfree); + } } +static int __init modules_wq_init(void) +{ + INIT_WORK(&init_free_wq, do_free_init); + init_llist_head(&init_free_list); + return 0; +} +module_init(modules_wq_init); + /* * This is where the real work happens. * @@ -3507,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) #endif module_enable_ro(mod, true); mod_tree_remove_init(mod); - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; @@ -3518,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path, so use actual RCU here. + * path. module_memfree() cannot be called in an interrupt, so do the + * work and call synchronize_rcu() in a work queue. + * * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie * rcu_barrier() */ - call_rcu(&freeinit->rcu, do_free_init); + if (llist_add(&freeinit->node, &init_free_list)) + schedule_work(&init_free_wq); + mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3822,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* we can't deallocate the module until we clear memory protection */ - module_disable_ro(mod); - module_disable_nx(mod); - ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); -- cgit v1.2.3 From d53d2f78ceadba081fc7785570798c3c8d50a718 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:38 -0700 Subject: bpf: Use vmalloc special flag Use new flag VM_FLUSH_RESET_PERMS for handling freeing of special permissioned memory in vmalloc and remove places where memory was set RW before freeing which is no longer needed. Don't track if the memory is RO anymore because it is now tracked in vmalloc. 
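Put together, the lifecycle the flag enables looks roughly like the sketch below; the allocation helper and the single-page size are illustrative, not taken from the patch:

    /* Allocate, arm the reset-on-free behaviour, then harden. */
    void *p = module_alloc(PAGE_SIZE);

    set_vm_flush_reset_perms(p);    /* vfree() will flush and reset perms */
    set_memory_ro((unsigned long)p, 1);
    set_memory_x((unsigned long)p, 1);

    /* ... run as read-only executable text ... */

    vfree(p);       /* no need to flip the pages back to RW first */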
Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Borkmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-19-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/bpf/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..c605397c79f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_unlock_ro(hdr); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); -- cgit v1.2.3 From 2bd1298ac17777525a41c8425521f569e412df14 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Tue, 30 Apr 2019 15:42:22 +0530 Subject: genirq: Introduce irq_chip_{request,release}_resource_parent() apis Introduce irq_chip_{request,release}_resource_parent() apis so that these can be used in hierarchical irqchips. Signed-off-by: Lokesh Vutla Signed-off-by: Marc Zyngier --- kernel/irq/chip.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 51128bea3846..29d6c7d070b4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1459,6 +1459,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); + +/** + * irq_chip_request_resources_parent - Request resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +int irq_chip_request_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + + if (data->chip->irq_request_resources) + return data->chip->irq_request_resources(data); + + return -ENOSYS; +} +EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); + +/** + * irq_chip_release_resources_parent - Release resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_release_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_release_resources) + data->chip->irq_release_resources(data); +} +EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); #endif /** -- cgit v1.2.3 From 524845ff9c473d315468fa3b54054a7e6b2d95cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Apr 2019 22:31:29 -0400 Subject: bpf: switch to ->free_inode() Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Al Viro --- kernel/bpf/inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4a8f390a2b82..bc53e5b20ddc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -566,9 +566,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_destroy_inode_deferred(struct rcu_head *head) +static void bpf_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); enum bpf_type type; if (S_ISLNK(inode->i_mode)) @@ -578,16 +577,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head) free_inode_nonrcu(inode); } -static void bpf_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); -} - static const struct super_operations bpf_super_ops = { .statfs = 
simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { -- cgit v1.2.3 From 98587c2d894c34c9af5cd84ca169e1cd493aa692 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 30 Apr 2019 12:33:45 +0200 Subject: s390: simplify disabled_wait The disabled_wait() function uses its argument as the PSW address when it stops the CPU with a wait PSW that is disabled for interrupts. The different callers sometimes use a specific number like 0xdeadbeef to indicate a specific failure; the early boot code uses 0, and some other call sites use __builtin_return_address(0). At the time a dump is created, the current PSW and the registers of a CPU are written to lowcore to make them available to the dump analysis tool. For a CPU stopped with disabled_wait, the PSW and the registers do not really make sense together: the PSW address does not point to the function the registers belong to. Simplify disabled_wait() by using _THIS_IP_ for the PSW address and drop the argument to the function. Signed-off-by: Martin Schwidefsky --- kernel/panic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 0ae0d7332f12..c1fcaad337b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,12 +318,7 @@ void panic(const char *fmt, ...) } #endif #if defined(CONFIG_S390) - { - unsigned long caller; - - caller = (unsigned long)__builtin_return_address(0); - disabled_wait(caller); - } + disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); -- cgit v1.2.3 From a5d5092c9285f6c8937b56f9c6ff2b22d818fc25 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 26 Feb 2019 13:16:14 -0600 Subject: gdbstub: mark expected switch fall-throughs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warnings: kernel/debug/gdbstub.c: In function ‘gdb_serial_stub’: kernel/debug/gdbstub.c:1031:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (remcom_in_buffer[1] == '\0') { ^ kernel/debug/gdbstub.c:1036:3: note: here case 'C': /* Exception passing */ ^~~~ kernel/debug/gdbstub.c:1040:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (tmp == 0) ^ kernel/debug/gdbstub.c:1043:3: note: here case 'c': /* Continue packet */ ^~~~ kernel/debug/gdbstub.c:1050:4: warning: this statement may fall through [-Wimplicit-fallthrough=] dbg_activate_sw_breakpoints(); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/debug/gdbstub.c:1052:3: note: here default: ^~~~~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 Notice that, in this particular case, the code comment is modified in accordance with what GCC is expecting to find. This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R.
Silva Acked-by: Jason Wessel Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..9f267b8905b4 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif + /* Fall through */ case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through on tmp < 0 */ + /* Fall through - on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through to default processing */ + /* Fall through - to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, -- cgit v1.2.3 From 4cc168eaf3b67d76547fb420c22abe22a3c86003 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Apr 2019 11:33:42 -0500 Subject: gdbstub: Replace strcpy() by strscpy() The strcpy() function is being deprecated. Replace it by the safer strscpy() and fix the following Coverity warning: "You might overrun the 1024-character fixed-size string remcom_in_buffer by copying cmd without checking the length." Addresses-Coverity-ID: 138999 ("Copy into fixed size buffer") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 9f267b8905b4..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1095,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) return error; case 's': case 'c': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); return 0; case '$': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); gdbstub_prev_in_buf_pos = 0; return 0; -- cgit v1.2.3 From 9b555c4d784c468b4167eef9ab621b5203e4f479 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Apr 2019 11:21:06 -0500 Subject: kdb: kdb_support: replace strcpy() by strscpy() The strcpy() function is being deprecated. Replace it by the safer strscpy() and fix the following Coverity warning: "You might overrun the 129-character fixed-size string ks_namebuf by copying name without checking the length." Addresses-Coverity-ID: 138995 ("Copy into fixed size buffer") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_support.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) while ((name = kdb_walk_kallsyms(&pos))) { if (strncmp(name, prefix_name, prefix_len) == 0) { - strcpy(ks_namebuf, name); + strscpy(ks_namebuf, name, sizeof(ks_namebuf)); /* Work out the longest name that matches the prefix */ if (++number == 1) { prev_len = min_t(int, max_len-1, -- cgit v1.2.3 From dbfe67334a1767bcb7be8b50bd237b22b272ef23 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:04 -0700 Subject: tracing: kdb: The skip_lines parameter should have been skip_entries The things skipped by kdb's "ftdump" command when you pass it a parameter have always been entries, not lines. The difference usually doesn't matter but when the trace buffer has multi-line entries (like a stack dump) it can matter. Let's fix this both in the help text for ftdump and also in the local variable names. Link: http://lkml.kernel.org/r/20190319171206.97107-1-dianders@chromium.org Acked-by: Daniel Thompson Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kdb.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 810d78a8d14c..4b666643d69f 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,7 +17,7 @@ #include "trace.h" #include "trace_output.h" -static void ftrace_dump_buf(int skip_lines, long cpu_file) +static void ftrace_dump_buf(int skip_entries, long cpu_file) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; @@ -70,11 +70,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); cnt++; - if (!skip_lines) { + if (!skip_entries) { print_trace_line(&iter); trace_printk_seq(&iter.seq); } else { - skip_lines--; + skip_entries--; } if (KDB_FLAG(CMD_INTERRUPT)) @@ -106,7 +106,7 @@ out: */ static int kdb_ftdump(int argc, const char **argv) { - int skip_lines = 0; + int skip_entries = 0; long cpu_file; char *cp; @@ -114,9 +114,9 @@ if (argc > 2) return KDB_ARGCOUNT; if (argc) { - skip_lines = simple_strtol(argv[1], &cp, 0); + skip_entries = simple_strtol(argv[1], &cp, 0); if (*cp) - skip_lines = 0; + skip_entries = 0; } if (argc == 2) { @@ -129,7 +129,7 @@ } kdb_trap_printk++; - ftrace_dump_buf(skip_lines, cpu_file); + ftrace_dump_buf(skip_entries, cpu_file); kdb_trap_printk--; return 0; @@ -137,7 +137,7 @@ static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } -- cgit v1.2.3 From ecffc8a8c7301f6f3c731ba23e38cd049a046416 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:05 -0700 Subject: tracing: Add trace_total_entries() / trace_total_entries_cpu() These two new exported functions will be used in a future patch by kdb_ftdump() to quickly skip all but
the last few trace entries. Link: http://lkml.kernel.org/r/20190319171206.97107-2-dianders@chromium.org Acked-by: Daniel Thompson Suggested-by: Steven Rostedt Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 65 ++++++++++++++++++++++++++++++++++++++++------------ kernel/trace/trace.h | 3 +++ 2 files changed, 53 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2bc18de7f0dc..dcb9adb44be9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3492,34 +3492,69 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) +{ + unsigned long count; + + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + static void get_total_entries(struct trace_buffer *buf, unsigned long *total, unsigned long *entries) { - unsigned long count; + unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(buf->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(buf->buffer, cpu); - *entries += count; + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; } } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->trace_buffer, &total, &entries); + + return entries; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index da00a3d508c1..33f14b9e78b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -721,6 +721,9 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, -- cgit v1.2.3 From 03197fc02b356606355d7ede343b18e3e3737771 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:06 -0700 Subject: tracing: kdb: Allow ftdump to skip all but the last few entries The 'ftdump' command in kdb is currently a bit of a last resort, at least if you have lots of traces turned on. It's going to print a whole boatload of data out your serial port which is probably running at 115200. 
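The counting helpers introduced just above make the "last N entries" arithmetic trivial for a consumer like this; a rough sketch (the wrapper is hypothetical, with ftrace_dump_buf() standing in for kdb's dumper and error handling omitted):

	/* Hypothetical: dump only the final 'want' entries of one CPU's buffer. */
	static void dump_tail(int cpu, int want)
	{
		unsigned long cnt = trace_total_entries_cpu(NULL, cpu); /* NULL: global trace array */
		int skip = cnt > (unsigned long)want ? (int)(cnt - want) : 0;

		ftrace_dump_buf(skip, cpu);
	}
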
This could easily take many, many minutes. Usually you're most interested in what's at the _end_ of the ftrace buffer, AKA what happened most recently. That means you've got to wait the full time for the dump. The 'ftdump' command does attempt to help you a little bit by allowing you to skip a fixed number of entries. Unfortunately it provides no way for you to know how many entries you should skip. Let's do similar to python and allow you to use a negative number to indicate that you want to skip all entries except the last few. This allows you to quickly see what you want. Note that we also change the printout in ftdump to print the (positive) number of entries actually skipped since that could be helpful to know when you've specified a negative skip count. Link: http://lkml.kernel.org/r/20190319171206.97107-3-dianders@chromium.org Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kdb.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 4b666643d69f..6c1ae6b752d1 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,29 +17,25 @@ #include "trace.h" #include "trace_output.h" +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + static void ftrace_dump_buf(int skip_entries, long cpu_file) { - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; - trace_init_global_iter(&iter); - iter.buffer_iter = buffer_iter; tr = iter.tr; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); + if (skip_entries) + kdb_printf("(skipping %d entries)\n", skip_entries); /* reset all but tr, trace, and overruns */ memset(&iter.seq, 0, @@ -89,10 +85,6 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) out: tr->trace_flags = old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - for_each_tracing_cpu(cpu) { if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); @@ -109,6 +101,8 @@ static int kdb_ftdump(int argc, const char **argv) int skip_entries = 0; long cpu_file; char *cp; + int cnt; + int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -129,7 +123,29 @@ static int kdb_ftdump(int argc, const char **argv) } kdb_trap_printk++; + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + /* A negative skip_entries means skip all but the last entries */ + if (skip_entries < 0) { + if (cpu_file == RING_BUFFER_ALL_CPUS) + cnt = trace_total_entries(NULL); + else + cnt = trace_total_entries_cpu(NULL, cpu_file); + skip_entries = max(cnt + skip_entries, 0); + } + ftrace_dump_buf(skip_entries, cpu_file); + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + kdb_trap_printk--; return 0; @@ -138,7 +154,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { kdb_register_flags("ftdump", 
kdb_ftdump, "[skip_#entries] [cpu]", - "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); + "Dump ftrace log; -skip dumps last #entries", 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } -- cgit v1.2.3 From 77a5352ba977d2554643e3797e10823d0d03dcf7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:44 +1000 Subject: sched/core: Allow the remote scheduler tick to be started on CPU0 This has no effect yet because CPU0 will always be a housekeeping CPU until a later change. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-2-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de8ab411826c..cef22c5499a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5866,7 +5866,7 @@ void __init sched_init_smp(void) static int __init migration_init(void) { - sched_rq_cpu_starting(smp_processor_id()); + sched_cpu_starting(smp_processor_id()); return 0; } early_initcall(migration_init); -- cgit v1.2.3 From aaebdf8d68479f78d9f72b239684f70fbb0722c6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 1 May 2019 14:58:18 +0100 Subject: genirq/msi: Add a new field in msi_desc to store an IOMMU cookie When an MSI doorbell is located downstream of an IOMMU, it is required to swizzle the physical address with an appropriately-mapped IOVA for any device attached to one of our DMA ops domain. At the moment, the allocation of the mapping may be done when composing the message. However, the composing may be done in non-preemptible context while the allocation must be called from preemptible context. A follow-up change will split the current logic in two functions requiring to keep an IOMMU cookie per MSI. A new field is introduced in msi_desc to store an IOMMU cookie. As the cookie may not be required in some configuration, the field is protected under a new config CONFIG_IRQ_MSI_IOMMU. A pair of helpers has also been introduced to access the field. Signed-off-by: Julien Grall Reviewed-by: Robin Murphy Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- kernel/irq/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..8fee06625c37 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -91,6 +91,9 @@ config GENERIC_MSI_IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ +config IRQ_MSI_IOMMU + bool + config HANDLE_DOMAIN_IRQ bool -- cgit v1.2.3 From 2f1a6fbbef7781382850c3104ecb658f21b5d460 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:45 +1000 Subject: power/suspend: Add function to disable secondaries for suspend This adds a function to disable secondary CPUs for suspend that are not necessarily non-zero / non-boot CPUs. Platforms will be able to use this to suspend using non-zero CPUs. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J .
Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-3-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/kexec_core.c | 4 ++-- kernel/power/hibernate.c | 12 ++++++------ kernel/power/suspend.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..fd5c95ff9251 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1150,7 +1150,7 @@ int kernel_kexec(void) error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); @@ -1183,7 +1183,7 @@ int kernel_kexec(void) Enable_irqs: local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..cfc7a57049e4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -281,7 +281,7 @@ static int create_image(int platform_mode) if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; @@ -323,7 +323,7 @@ static int create_image(int platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: platform_finish(platform_mode); @@ -417,7 +417,7 @@ int hibernation_snapshot(int platform_mode) int __weak hibernate_resume_nonboot_cpu_disable(void) { - return disable_nonboot_cpus(); + return suspend_disable_secondary_cpus(); } /** @@ -486,7 +486,7 @@ static int resume_target_kernel(bool platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); @@ -564,7 +564,7 @@ int hibernation_platform_enter(void) if (error) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; @@ -586,7 +586,7 @@ int hibernation_platform_enter(void) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..59b6def23046 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -428,7 +428,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -458,7 +458,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) BUG_ON(irqs_disabled()); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_wake: platform_resume_noirq(state); -- cgit v1.2.3 From 9ca12ac04bb7d7cfb28aa549dcd3d15761f15543 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:46 +1000 Subject: kernel/cpu: Allow non-zero CPU to be primary for suspend / kexec freeze This patch provides an arch option, ARCH_SUSPEND_NONZERO_CPU, to opt-in to allowing suspend to occur on one of the housekeeping CPUs rather than hardcoded CPU0. This will allow CPU0 to be a nohz_full CPU with a later change. 
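Only the kernel/ side of the series appears in this log; the header-side helper that consumes the new option plausibly looks like the following sketch (an assumption, since the include/ change is filtered out of this view):

	/* Assumed shape of the include/linux/cpu.h helper (not shown in these diffs). */
	static inline int suspend_disable_secondary_cpus(void)
	{
		int cpu = 0;

		if (IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU))
			cpu = -1;	/* let freeze_secondary_cpus() pick a housekeeping CPU */

		return freeze_secondary_cpus(cpu);
	}
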
It may be possible for platforms with hardware/firmware restrictions on suspend/wake to effectively support this by handing off the final stage to CPU0 when kernel housekeeping is no longer required. Another option is to make housekeeping / nohz_full mask dynamic at runtime, but the complexity could not be justified at this time. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-4-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 10 +++++++++- kernel/power/Kconfig | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6754f3ecfd94..d1bf6e2b4752 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1199,8 +1200,15 @@ int freeze_secondary_cpus(int primary) int cpu, error = 0; cpu_maps_update_begin(); - if (!cpu_online(primary)) + if (primary == -1) { primary = cpumask_first(cpu_online_mask); + if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) + primary = housekeeping_any_cpu(HK_FLAG_TIMER); + } else { + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); + } + /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..9bbaaab14b36 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -114,6 +114,15 @@ config PM_SLEEP_SMP depends on PM_SLEEP select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU + def_bool y + depends on PM_SLEEP_SMP + depends on ARCH_SUSPEND_NONZERO_CPU + ---help--- + If an arch can suspend (for suspend, hibernate, kexec, etc) on a + non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This + will allow nohz_full mask to include CPU0. + config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP -- cgit v1.2.3 From 9219565aa89033a9cfdae788c1940473a1253d6c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:47 +1000 Subject: sched/isolation: Require a present CPU in housekeeping mask During housekeeping mask setup, currently a possible CPU is required. That does not guarantee the CPU would be available at boot time, so check to ensure that at least one present CPU is in the mask. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J .
Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-5-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b02d148e7672..687302051a27 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -65,6 +65,7 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + cpumask_var_t tmp; int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) return 0; } + alloc_bootmem_cpumask_var(&tmp); if (!housekeeping_flags) { alloc_bootmem_cpumask_var(&housekeeping_mask); cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - if (cpumask_empty(housekeeping_mask)) + + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) { + pr_warn("Housekeeping: must include one present CPU, " + "using boot CPU:%d\n", smp_processor_id()); __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + } } else { - cpumask_var_t tmp; - - alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); if (!cpumask_equal(tmp, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - free_bootmem_cpumask_var(tmp); } + free_bootmem_cpumask_var(tmp); if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { -- cgit v1.2.3 From 08ae95f4fd3b38b257f5dc7e6507e071c27ba0d5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:48 +1000 Subject: nohz_full: Allow the boot CPU to be nohz_full Allow the boot CPU/CPU0 to be nohz_full. Have the boot CPU take the do_timer duty during boot until a housekeeping CPU can take over. This is supported when CONFIG_PM_SLEEP_SMP is not configured, or when it is configured and the arch allows suspend on non-zero CPUs. nohz_full has been trialed at a large supercomputer site and found to significantly reduce jitter. In order to deploy it in production, they need CPU0 to be nohz_full because their job control system requires the application CPUs to start from 0, and the housekeeping CPUs are placed higher. An equivalent job scheduling that uses CPU0 for housekeeping could be achieved by modifying their system, but it is preferable if nohz_full can support their environment without modification. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-6-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 50 +++++++++++++++++++++++++++++++++++++++++++---- kernel/time/tick-sched.c | 34 ++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df401463a191..e49e8091f9ac 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -46,6 +46,14 @@ ktime_t tick_period; * procedure also covers cpu hotplug. */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. + */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif /* * Debugging: see timer_list.c @@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ + int cpu = *(unsigned int *)info; + + WARN_ON(tick_do_timer_cpu != smp_processor_id()); + + tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ + int cpu = smp_processor_id(); + int from = tick_do_timer_boot_cpu; + + if (from >= 0 && from != cpu) + smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif + /* * Setup the tick device */ @@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - else - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); tick_period = NSEC_PER_SEC / HZ; +#ifdef CONFIG_NO_HZ_FULL + /* + * The boot CPU may be nohz_full, in which case set + * tick_do_timer_boot_cpu so the first housekeeping + * secondary that comes up will take do_timer from + * us. + */ + if (tick_nohz_full_cpu(cpu)) + tick_do_timer_boot_cpu = cpu; + + } else if (tick_do_timer_boot_cpu != -1 && + !tick_nohz_full_cpu(cpu)) { + tick_take_do_timer_from_boot(); + tick_do_timer_boot_cpu = -1; + WARN_ON(tick_do_timer_cpu != cpu); +#endif } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..4aa917acbe1c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. + * + * If nohz_full is enabled, this should not happen because the + * tick_do_timer_cpu never relinquishes. */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) - && !tick_nohz_full_cpu(cpu)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL + WARN_ON(tick_nohz_full_running); +#endif tick_do_timer_cpu = cpu; + } #endif /* Check, if the jiffies need an update */ @@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) static int tick_nohz_cpu_down(unsigned int cpu) { /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks + * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. 
It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) @@ -423,12 +429,15 @@ void __init tick_nohz_init(void) return; } - cpu = smp_processor_id(); + if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && + !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { + cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", - cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warn("NO_HZ: Clearing %d from nohz_full range " + "for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } } for_each_cpu(cpu, tick_nohz_full_mask) @@ -904,8 +913,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) /* * Boot safety: make sure the timekeeping duty has been * assigned before entering dyntick-idle mode, + * tick_do_timer_cpu is TICK_DO_TIMER_BOOT */ - if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) + return false; + + /* Should not happen for nohz-full */ + if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } -- cgit v1.2.3 From 4d141ab3416d90f87775f5dee725efdf40110a8f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 3 May 2019 15:26:24 +0200 Subject: livepatch: Remove custom kobject state handling kobject_init() always succeeds and sets the reference count to 1. It allows to always free the structures via kobject_put() and the related release callback. Note that the custom kobject state handling was used only because we did not know that kobject_put() can and actually should get called even when kobject_init_and_add() fails. The patch should not change the existing behavior. Suggested-by: "Tobin C. Harding" Signed-off-by: Petr Mladek Reviewed-by: Kamalesh Babulal Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 56 +++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 14f33ab6c583..42385f23252a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,6 +426,9 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } +static struct kobj_type klp_ktype_object; +static struct kobj_type klp_ktype_func; + static struct klp_object *klp_alloc_object_dynamic(const char *name) { struct klp_object *obj; @@ -443,6 +446,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); obj->dynamic = true; return obj; @@ -471,6 +475,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } + kobject_init(&func->kobj, &klp_ktype_func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -588,13 +593,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only) continue; list_del(&func->node); - - /* Might be called from klp_init_patch() error path. */ - if (func->kobj_added) { - kobject_put(&func->kobj); - } else if (func->nop) { - klp_free_func_nop(func); - } + kobject_put(&func->kobj); } } @@ -624,13 +623,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only) continue; list_del(&obj->node); - - /* Might be called from klp_init_patch() error path. 
*/ - if (obj->kobj_added) { - kobject_put(&obj->kobj); - } else if (obj->dynamic) { - klp_free_object_dynamic(obj); - } + kobject_put(&obj->kobj); } } @@ -675,10 +668,8 @@ static void klp_free_patch_finish(struct klp_patch *patch) * this is called when the patch gets disabled and it * cannot get enabled again. */ - if (patch->kobj_added) { - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); - } + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); /* Put the module after the last access to struct klp_patch. */ if (!patch->forced) @@ -700,8 +691,6 @@ static void klp_free_patch_work_fn(struct work_struct *work) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - int ret; - if (!func->old_name) return -EINVAL; @@ -724,13 +713,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); - if (!ret) - func->kobj_added = true; - - return ret; + return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", + func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -801,11 +786,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? obj->name : "vmlinux"; - ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, - &patch->kobj, "%s", name); + ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); if (ret) return ret; - obj->kobj_added = true; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); @@ -829,7 +812,7 @@ static int klp_init_patch_early(struct klp_patch *patch) INIT_LIST_HEAD(&patch->list); INIT_LIST_HEAD(&patch->obj_list); - patch->kobj_added = false; + kobject_init(&patch->kobj, &klp_ktype_patch); patch->enabled = false; patch->forced = false; INIT_WORK(&patch->free_work, klp_free_patch_work_fn); @@ -840,11 +823,11 @@ static int klp_init_patch_early(struct klp_patch *patch) return -EINVAL; INIT_LIST_HEAD(&obj->func_list); - obj->kobj_added = false; + kobject_init(&obj->kobj, &klp_ktype_object); list_add_tail(&obj->node, &patch->obj_list); klp_for_each_func_static(obj, func) { - func->kobj_added = false; + kobject_init(&func->kobj, &klp_ktype_func); list_add_tail(&func->node, &obj->func_list); } } @@ -860,11 +843,9 @@ static int klp_init_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, "%s", patch->mod->name); + ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name); if (ret) return ret; - patch->kobj_added = true; if (patch->replace) { ret = klp_add_nops(patch); @@ -926,9 +907,6 @@ static int __klp_enable_patch(struct klp_patch *patch) if (WARN_ON(patch->enabled)) return -EINVAL; - if (!patch->kobj_added) - return -EINVAL; - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_init_transition(patch, KLP_PATCHED); -- cgit v1.2.3 From f68d67cf2f83dc82675969724b59ca7c6da43fa9 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 3 May 2019 15:26:25 +0200 Subject: livepatch: Remove duplicated code for early initialization kobject_init() call added one more operation that has to be done when doing the early initialization of both static and dynamic livepatch structures. 
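For context, the init-then-add-then-put pattern these two cleanups rely on, as a minimal sketch (all names hypothetical):

	static void demo_release(struct kobject *kobj)
	{
		kfree(container_of(kobj, struct demo, kobj));
	}

	static struct kobj_type demo_ktype = {
		.release = demo_release,
	};

	kobject_init(&d->kobj, &demo_ktype);		/* refcount is now 1 */
	err = kobject_add(&d->kobj, parent, "%s", name);
	if (err)
		kobject_put(&d->kobj);			/* legal even after a failed add; release frees d */
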
It would have been easier when the early initialization code was not duplicated. Let's deduplicate it for future generations of livepatching hackers. The patch does not change the existing behavior. Signed-off-by: Petr Mladek Reviewed-by: Kamalesh Babulal Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 42385f23252a..f12c0eabd843 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,10 +426,13 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } -static struct kobj_type klp_ktype_object; -static struct kobj_type klp_ktype_func; +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj); -static struct klp_object *klp_alloc_object_dynamic(const char *name) +static struct klp_object *klp_alloc_object_dynamic(const char *name, + struct klp_patch *patch) { struct klp_object *obj; @@ -445,8 +448,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } } - INIT_LIST_HEAD(&obj->func_list); - kobject_init(&obj->kobj, &klp_ktype_object); + klp_init_object_early(patch, obj); obj->dynamic = true; return obj; @@ -475,7 +477,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } - kobject_init(&func->kobj, &klp_ktype_func); + klp_init_func_early(obj, func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -495,11 +497,9 @@ static int klp_add_object_nops(struct klp_patch *patch, obj = klp_find_object(patch, old_obj); if (!obj) { - obj = klp_alloc_object_dynamic(old_obj->name); + obj = klp_alloc_object_dynamic(old_obj->name, patch); if (!obj) return -ENOMEM; - - list_add_tail(&obj->node, &patch->obj_list); } klp_for_each_func(old_obj, old_func) { @@ -510,8 +510,6 @@ static int klp_add_object_nops(struct klp_patch *patch, func = klp_alloc_func_nop(old_func, obj); if (!func) return -ENOMEM; - - list_add_tail(&func->node, &obj->func_list); } return 0; @@ -802,6 +800,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return ret; } +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func) +{ + kobject_init(&func->kobj, &klp_ktype_func); + list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj) +{ + INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); + list_add_tail(&obj->node, &patch->obj_list); +} + static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; @@ -822,13 +835,10 @@ static int klp_init_patch_early(struct klp_patch *patch) if (!obj->funcs) return -EINVAL; - INIT_LIST_HEAD(&obj->func_list); - kobject_init(&obj->kobj, &klp_ktype_object); - list_add_tail(&obj->node, &patch->obj_list); + klp_init_object_early(patch, obj); klp_for_each_func_static(obj, func) { - kobject_init(&func->kobj, &klp_ktype_func); - list_add_tail(&func->node, &obj->func_list); + klp_init_func_early(obj, func); } } -- cgit v1.2.3 From 13bf5ced93775ffccb53527a9d862e023a9daa03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Mar 2019 15:44:06 +0100 Subject: dma-mapping: add a Kconfig symbol to indicate arch_dma_prep_coherent presence Add a 
Kconfig symbol that indicates an architecture provides an arch_dma_prep_coherent implementation, and provide a stub otherwise. This will allow the generic dma-iommu code to use it while still allowing it to be built for cache coherent architectures. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 52b704e2b97a..83d711f8d665 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -38,6 +38,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool +config ARCH_HAS_DMA_PREP_COHERENT + bool + config ARCH_HAS_DMA_COHERENT_TO_PFN bool -- cgit v1.2.3 From 533307dc20a9e84a0687d4ca24aeb669516c0243 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 30 Apr 2019 17:57:29 +0800 Subject: cgroup: Remove unused cgrp variable The 'cgrp' is set but not used in commit <76f969e8948d8> ("cgroup: cgroup v2 freezer"). Remove it to avoid [-Wunused-but-set-variable] warning. Cc: Tejun Heo Signed-off-by: Shaokun Zhang Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57edcf398d71..4fe9f7f1a3fa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5864,11 +5864,8 @@ void cgroup_post_fork(struct task_struct *child) * the task into the frozen state. */ if (unlikely(cgroup_task_freeze(child))) { - struct cgroup *cgrp; - spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); - cgrp = cset->dfl_cgrp; child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); -- cgit v1.2.3 From cb2c4cd87874a7975b7b8615866b3a87bae10aab Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Apr 2019 10:59:44 -0700 Subject: cgroup: prevent spurious transition into non-frozen state If freezing of a cgroup races with waking of a task from the frozen state (like waiting in vfork() or in do_signal_stop()), a spurious transition of the cgroup state can happen. The task enters cgroup_leave_frozen(true), the cgroup->nr_frozen_tasks counter decrements, and the cgroup is switched to the unfrozen state. To prevent it, let's reserve cgroup_leave_frozen(true) for terminating processes and use cgroup_leave_frozen(false) otherwise. To avoid busy-looping in the signal handling loop waiting for JOBCTL_TRAP_FREEZE to be set from the cgroup freezing path, let's do it explicitly in cgroup_leave_frozen(), if the task is going to stay frozen. Suggested-by: Oleg Nesterov Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/freezer.c | 16 +++++----------- kernel/signal.c | 2 +- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3bfbb3c8baf3..c321e768f8d3 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -139,19 +139,13 @@ void cgroup_leave_frozen(bool always_leave) cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(&current->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(&current->sighand->siglock); } spin_unlock_irq(&css_set_lock); - - if (unlikely(current->frozen)) { - /* - * If the task remained in the frozen state, - * make sure it won't reach userspace without - * entering the signal handling loop.
- */ - spin_lock_irq(&current->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - } } /* diff --git a/kernel/signal.c b/kernel/signal.c index 095e0fc57b25..16b72f4f14df 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2514,7 +2514,7 @@ relock: */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); - cgroup_leave_frozen(true); + cgroup_leave_frozen(false); goto relock; } -- cgit v1.2.3 From 96b9c592def5d7203bdad1337d9c92a2183de5cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Apr 2019 10:59:45 -0700 Subject: cgroup: get rid of cgroup_freezer_frozen_exit() A task should never enter the exit path with the task->frozen bit set. Any frozen task must enter the signal handling loop and the only way to escape is through cgroup_leave_frozen(true), which unconditionally drops the task->frozen bit. So it means that cgroup_freezer_frozen_exit() has zero chances to be called and has to be removed. Let's put a WARN_ON_ONCE() instead of the cgroup_freezer_frozen_exit() call to catch any potential leak of the task's frozen bit. Suggested-by: Oleg Nesterov Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 5 ++--- kernel/cgroup/freezer.c | 10 ---------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4fe9f7f1a3fa..327f37c9fdfa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5926,9 +5926,8 @@ void cgroup_exit(struct task_struct *tsk) css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; - if (unlikely(cgroup_task_frozen(tsk))) - cgroup_freezer_frozen_exit(tsk); - else if (unlikely(cgroup_task_freeze(tsk))) + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index c321e768f8d3..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -248,16 +248,6 @@ void cgroup_freezer_migrate_task(struct task_struct *task, cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } -void cgroup_freezer_frozen_exit(struct task_struct *task) -{ - struct cgroup *cgrp = task_dfl_cgroup(task); - - lockdep_assert_held(&css_set_lock); - - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); -} - void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; -- cgit v1.2.3 From 1900da520c9fdc3a7cf00d21638f7c8721d5ac7f Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Sun, 21 Apr 2019 19:47:27 +0800 Subject: kernel: cgroup: fix misuse of %x Pointers should be printed with %p or %px rather than cast to unsigned long type and printed with %lx. Change %lx to %p to print the pointers.
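For illustration, a sketch of the difference (keeping in mind that on recent kernels plain %p prints a hashed value, while %px prints the raw address and is meant for debugging only):

	struct cgroup_subsys_state *css = get_css();	/* hypothetical helper */

	pr_info("css=%lx\n", (unsigned long)css);	/* avoid: the cast defeats printk format checking */
	pr_info("css=%p\n", css);			/* preferred: printed (hashed) as a pointer */
	pr_info("css=%px\n", css);			/* raw address, debug-only */
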
Signed-off-by: Fuqian Huang Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } -- cgit v1.2.3 From a9e9bcb45b1525ba7aea26ed9441e8632aeeda58 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 28 Apr 2019 17:25:38 -0400 Subject: locking/rwsem: Prevent decrement of reader count before increment During my rwsem testing, it was found that after a down_read(), the reader count may occasionally become 0 or even negative. Consequently, a writer may steal the lock at that time and execute with the reader in parallel thus breaking the mutual exclusion guarantee of the write lock. In other words, both readers and writer can become rwsem owners simultaneously. The current reader wakeup code does it in one pass to clear waiter->task and put them into wake_q before fully incrementing the reader count. Once waiter->task is cleared, the corresponding reader may see it, finish the critical section and do unlock to decrement the count before the count is incremented. This is not a problem if there is only one reader to wake up as the count has been pre-incremented by 1. It is a problem if there are more than one readers to be woken up and writer can steal the lock. The wakeup was actually done in 2 passes before the following v4.9 commit: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once") To fix this problem, the wakeup is now done in two passes again. In the first pass, we collect the readers and count them. The reader count is then fully incremented. In the second pass, the waiter->task is then cleared and they are put into wake_q to be woken up later. 
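In outline, the wakeup path becomes the following (a simplified sketch of the shape of the fix, not the exact code in the diff below):

	/* pass 1: count the contiguous readers at the head of the queue */
	list_for_each_entry(waiter, &sem->wait_list, list) {
		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
			break;
		woken++;
	}
	list_cut_before(&wlist, &sem->wait_list, &waiter->list);

	/* fully publish the reader count before any waiter can run */
	atomic_long_add(woken * RWSEM_ACTIVE_READ_BIAS - adjustment, &sem->count);

	/* pass 2: only now clear waiter->task and queue the wakeups */
	list_for_each_entry_safe(waiter, tmp, &wlist, list)
		queue_reader_wakeup(waiter);	/* hypothetical stand-in for the wake_q handoff */
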
Signed-off-by: Waiman Long Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Fixes: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once") Link: http://lkml.kernel.org/r/20190428212557.13482-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 46 ++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 6b3ee9948bf1..0b1f77957240 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, { struct rwsem_waiter *waiter, *tmp; long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; /* * Take a peek at the queue head waiter such that we can determine @@ -188,18 +189,43 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * of the queue. We know that woken will be at least 1 as we accounted * for above. Note we increment the 'active part' of the count by the * number of readers before waking any processes up. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. */ - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { - struct task_struct *tsk; - + list_for_each_entry(waiter, &sem->wait_list, list) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; woken++; - tsk = waiter->task; + } + list_cut_before(&wlist, &sem->wait_list, &waiter->list); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + tsk = waiter->task; get_task_struct(tsk); - list_del(&waiter->list); + /* * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -213,16 +239,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add_safe(wake_q, tsk); } - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); } /* -- cgit v1.2.3 From b3e5838252665ee4cfa76b82bdf1198dca81e5be Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 27 Mar 2019 13:04:15 +0100 Subject: clone: add CLONE_PIDFD This patchset makes it possible to retrieve pid file descriptors at process creation time by introducing the new flag CLONE_PIDFD to the clone() system call. Linus originally suggested to implement this as a new flag to clone() instead of making it a separate system call. 
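A minimal user-space sketch of the resulting interface (assuming the x86_64 raw clone() argument order; error handling omitted):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef CLONE_PIDFD
	#define CLONE_PIDFD 0x00001000	/* as introduced by this patch */
	#endif

	int main(void)
	{
		int pidfd = -1;
		/* x86_64 raw clone: (flags, stack, parent_tid, child_tid, tls);
		 * with CLONE_PIDFD the new fd comes back through parent_tid. */
		long pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, 0, &pidfd, 0, 0);

		if (pid == 0)
			_exit(0);	/* child */
		printf("child %ld, pidfd %d\n", pid, pidfd);
		return 0;
	}
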
As spotted by Linus, there is exactly one bit for clone() left. CLONE_PIDFD creates file descriptors based on the anonymous inode implementation in the kernel that will also be used to implement the new mount api. They serve as a simple opaque handle on pids. Logically, this makes it possible to interpret a pidfd differently, narrowing or widening the scope of various operations (e.g. signal sending). Thus, a pidfd cannot just refer to a tgid, but also a tid, or in theory - given appropriate flag arguments in relevant syscalls - a process group or session. A pidfd does not represent a privilege. This does not imply it cannot ever be that way but for now this is not the case. A pidfd comes with additional information in fdinfo if the kernel supports procfs. The fdinfo file contains the pid of the process in the callers pid namespace in the same format as the procfs status file, i.e. "Pid:\t%d". As suggested by Oleg, with CLONE_PIDFD the pidfd is returned in the parent_tidptr argument of clone. This has the advantage that we can give back the associated pid and the pidfd at the same time. To remove worries about missing metadata access this patchset comes with a sample program that illustrates how a combination of CLONE_PIDFD, and pidfd_send_signal() can be used to gain race-free access to process metadata through /proc/. The sample program can easily be translated into a helper that would be suitable for inclusion in libc so that users don't have to worry about writing it themselves. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner Co-developed-by: Jann Horn Signed-off-by: Jann Horn Reviewed-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Thomas Gleixner Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro --- kernel/fork.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..e45f0acaf451 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1662,6 +1664,58 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +const struct file_operations pidfd_fops = { + .release = pidfd_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. 
+ */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1674,13 +1728,14 @@ static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; @@ -1730,6 +1785,31 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -1936,6 +2016,22 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). + */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1996,7 +2092,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2111,6 +2207,9 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + ksys_close(pidfd); bad_fork_free_pid: cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) @@ -2176,7 +2275,7 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task); @@ -2223,7 +2322,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); -- cgit v1.2.3 From 2151ad1b067275730de1b38c7257478cae47d29e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Apr 2019 22:50:25 +0200 Subject: signal: support CLONE_PIDFD with pidfd_send_signal Let pidfd_send_signal() use pidfds retrieved via CLONE_PIDFD. 
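Combined with the clone() sketch earlier, user space can then signal the child race-free, roughly as follows (the syscall number is an assumption for x86_64; check your headers):

	#include <signal.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_pidfd_send_signal
	#define __NR_pidfd_send_signal 424	/* assumed x86_64 number */
	#endif

	/* pidfd as obtained from clone(CLONE_PIDFD, ...) */
	static int terminate_child(int pidfd)
	{
		if (syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0) {
			perror("pidfd_send_signal");
			return -1;
		}
		return 0;
	}
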
With this patch pidfd_send_signal() becomes independent of procfs. This fulfils the request made when we merged the pidfd_send_signal() patchset. The pidfd_send_signal() syscall is now always available allowing for it to be used by users without procfs mounted or even users without procfs support compiled into the kernel. Signed-off-by: Christian Brauner Co-developed-by: Jann Horn Signed-off-by: Jann Horn Acked-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Thomas Gleixner Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro --- kernel/signal.c | 12 +++++++++--- kernel/sys_ni.c | 3 --- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..1581140f2d99 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -#ifdef CONFIG_PROC_FS /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid @@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) return copy_siginfo_from_user(kinfo, info); } +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + /** * sys_pidfd_send_signal - send a signal to a process through a task file * descriptor @@ -3586,7 +3593,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, return -EBADF; /* Is this a pidfd? */ - pid = tgid_pidfd_to_pid(f.file); + pid = pidfd_to_pid(f.file); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; @@ -3620,7 +3627,6 @@ err: fdput(f); return ret; } -#endif /* CONFIG_PROC_FS */ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d21f4befaea4..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -167,9 +167,6 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ -/* kernel/signal.c */ -COND_SYSCALL(pidfd_send_signal); - /* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); -- cgit v1.2.3 From 4dd537aca25dd2e0e8aca8b8923930cbe6240003 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:55:41 +0900 Subject: tracing: uprobes: Re-enable $comm support for uprobe events Since commit 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") dropped the $comm support from uprobe events, this re-enables it. For $comm support, this uses strlcpy() instead of strncpy_from_user() to copy the current task's comm. Because the comm is in kernel space, strncpy_from_user() always fails to copy it. This also uses strlen() instead of strnlen_user() to measure the length of the comm. Note that this uses -ECOMM as a token value to fetch the comm string. If the user-space pointer equals -ECOMM, it will be translated to task->comm.
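As a usage sketch, a uprobe event recording the current comm could then be registered from user space like this (the tracefs path, target binary, and offset are all hypothetical):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static void add_comm_uprobe(void)
	{
		/* hypothetical probe target at offset 0x4245c0 in /bin/bash */
		static const char cmd[] = "p:u_comm /bin/bash:0x4245c0 $comm:string\n";
		int fd = open("/sys/kernel/debug/tracing/uprobe_events",
			      O_WRONLY | O_APPEND);

		if (fd >= 0) {
			write(fd, cmd, strlen(cmd));
			close(fd);
		}
	}
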
Link: http://lkml.kernel.org/r/155723734162.9149.4042756162201097965.stgit@devnote2 Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Reported-by: Andreas Ziegler Acked-by: Andreas Ziegler Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b7737666c1a8..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -124,6 +124,7 @@ struct fetch_insn { /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ #define FETCH_INSN_MAX 16 +#define FETCH_TOKEN_COMM (-ECOMM) /* Fetch type information table */ struct fetch_type { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cd8750a72768..eb7e06b54741 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (unlikely(!maxlen)) return -ENOMEM; - ret = strncpy_from_user(dst, src, maxlen); + if (addr == FETCH_TOKEN_COMM) + ret = strlcpy(dst, current->comm, maxlen); + else + ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr) int len; void __user *vaddr = (void __force __user *) addr; - len = strnlen_user(vaddr, MAX_STRING_SIZE); + if (addr == FETCH_TOKEN_COMM) + len = strlen(current->comm) + 1; + else + len = strnlen_user(vaddr, MAX_STRING_SIZE); return (len > MAX_STRING_SIZE) ? 0 : len; } @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_IMM: val = code->immediate; break; + case FETCH_OP_COMM: + val = FETCH_TOKEN_COMM; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; -- cgit v1.2.3 From 489fe0096b19b664b8f3bed0fd604d617a229b5a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:55:52 +0900 Subject: tracing: probeevent: Do not accumulate on ret variable Do not accumulate the strlen result in the "ret" local variable, because it is already accumulated in the "total" local variable for the array case. Link: http://lkml.kernel.org/r/155723735237.9149.3192150444705457531.stgit@devnote2 Fixes: 40b53b771806 ("tracing: probeevent: Add array type support") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4737bb8c07a3..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -88,7 +88,7 @@ stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { if (code->op == FETCH_OP_ST_STRING) { - ret += fetch_store_strlen(val + code->offset); + ret = fetch_store_strlen(val + code->offset); code++; goto array; } else -- cgit v1.2.3 From 3dd1f7f24f8ceec00bbbc364c2ac3c893f0fdc4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:56:02 +0900 Subject: tracing: probeevent: Fix to make the type of $comm string Fix to make the type of $comm "string". If we set another type on the $comm argument, it shows a meaningless value or wrong data. Currently probe events allow us to set string array type (e.g.
":string[2]"), or other digit types like x8 on $comm. But since clearly $comm is just a string data, it should not be fetched by other types including array. Link: http://lkml.kernel.org/r/155723736241.9149.14582064184468574539.stgit@devnote2 Cc: Andreas Ziegler Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4cc2d467d34c..e0d1d5353464 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -533,13 +533,14 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } } - /* - * The default type of $comm should be "string", and it can't be - * dereferenced. - */ - if (!t && strcmp(arg, "$comm") == 0) + + /* Since $comm can not be dereferred, we can find $comm by strcmp */ + if (strcmp(arg, "$comm") == 0) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + return -EINVAL; parg->type = find_fetch_type("string"); - else + } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); -- cgit v1.2.3 From 5c173bedb24dfcc8e412d3c3f111c504e4408dd5 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 1 Nov 2018 11:46:40 -0400 Subject: ring-buffer: Fix mispelling of Calculate It's not "Caculate". Link: http://lkml.kernel.org/r/20181101154640.23162-1-tiny.windzz@gmail.com Signed-off-by: Yangtao Li Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -362,7 +362,7 @@ static void ring_buffer_producer(void) hit--; /* make it non zero */ } - /* Caculate the average time in nanosecs */ + /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); trace_printk("%ld ns per entry\n", avg); } -- cgit v1.2.3 From 0f5e5a3ab7fa1c09370a4d709ad6157457d5b8b6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Mar 2019 09:17:57 +0100 Subject: tracing: Eliminate const char[] auto variables Automatic const char[] variables cause unnecessary code generation. For example, the this_mod variable leads to 3f04: 48 b8 5f 5f 74 68 69 73 5f 6d movabs $0x6d5f736968745f5f,%rax # __this_m 3f0e: 4c 8d 44 24 02 lea 0x2(%rsp),%r8 3f13: 48 8d 7c 24 10 lea 0x10(%rsp),%rdi 3f18: 48 89 44 24 02 mov %rax,0x2(%rsp) 3f1d: 4c 89 e9 mov %r13,%rcx 3f20: b8 65 00 00 00 mov $0x65,%eax # e 3f25: 48 c7 c2 00 00 00 00 mov $0x0,%rdx 3f28: R_X86_64_32S .rodata.str1.1+0x18d 3f2c: be 48 00 00 00 mov $0x48,%esi 3f31: c7 44 24 0a 6f 64 75 6c movl $0x6c75646f,0xa(%rsp) # odul 3f39: 66 89 44 24 0e mov %ax,0xe(%rsp) i.e., the string gets built on the stack at runtime. Similar code can be found for the other instances I'm replacing here. Putting the string in .rodata reduces the combined .text+.rodata size and saves time and stack space at runtime. The simplest fix, and what I've done for the this_mod case, is to just make the variable static. 
However, for the "" case where the same string is used twice, that prevents the linker from merging those two literals, so instead use a macro - that also keeps the two instances automatically in sync (instead of only the compile-time strlen expression). Finally, for the two runs of spaces, it turns out that the "build these strings on the stack" is not the worst part of what gcc does - it turns print_func_help_header_irq() into "if (tgid) { /* print_event_info + five seq_printf calls */ } else { /* print event_info + another five seq_printf */}". Taking inspiration from a suggestion from Al Viro, use %.*s to make snprintf either stop after the first two spaces or print the whole string. As a bonus, the seq_printfs now fit on single lines (at least, they are not longer than the existing ones in the function just above), making it easier to see that the ascii art lines up. x86-64 defconfig + CONFIG_FUNCTION_TRACER: $ scripts/stackdelta /tmp/stackusage.{0,1} ./kernel/trace/ftrace.c ftrace_mod_callback 152 136 -16 ./kernel/trace/trace.c trace_default_header 56 32 -24 ./kernel/trace/trace.c tracing_mark_raw_write 96 72 -24 ./kernel/trace/trace.c tracing_mark_write 104 80 -24 bloat-o-meter add/remove: 1/0 grow/shrink: 0/4 up/down: 14/-375 (-361) Function old new delta this_mod - 14 +14 ftrace_mod_callback 577 542 -35 tracing_mark_raw_write 444 374 -70 tracing_mark_write 616 540 -76 trace_default_header 600 406 -194 Link: http://lkml.kernel.org/r/20190320081757.6037-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 34 +++++++++++++--------------------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 433a64f49532..7765a53f1006 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3875,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ - const char this_mod[] = "__this_module"; + static const char this_mod[] = "__this_module"; char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcb9adb44be9..3259019cc66d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3593,25 +3593,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char tgid_space[] = " "; - const char space[] = " "; + const char *space = " "; + int prec = tgid ? 10 : 2; print_event_info(buf, m); - seq_printf(m, "# %s _-----=> irqs-off\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s / _----=> need-resched\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s||| / delay\n", - tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", - tgid ? " TGID " : space); - seq_printf(m, "# | | %s | |||| | |\n", - tgid ? 
" | " : space); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -6342,13 +6335,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - const char faulted[] = ""; ssize_t written; int size; int len; /* Used in tracing_mark_raw_write() as well */ -#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ +#define FAULTED_STR "" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -6380,7 +6373,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else @@ -6421,7 +6414,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; - const char faulted[] = ""; unsigned long irq_flags; ssize_t written; int size; @@ -6461,7 +6453,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; -- cgit v1.2.3 From bfcd631eb6de474d8e097fd0f9f840fdf7272a1d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Nov 2018 13:23:12 +0000 Subject: tracing: Fix white space issues in parse_pred() function Trivial fix to clean up an indentation issue, a whole chunk of code has an extra space in the indentation. Link: http://lkml.kernel.org/r/20181109132312.20994-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 180ecb390baa..d3e59312ef40 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1222,30 +1222,30 @@ static int parse_pred(const char *str, void *data, * (perf doesn't use it) and grab everything. */ if (strcmp(field->name, "ip") != 0) { - parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); - goto err_free; - } - pred->fn = filter_pred_none; - - /* - * Quotes are not required, but if they exist then we need - * to read them till we hit a matching one. - */ - if (str[i] == '\'' || str[i] == '"') - q = str[i]; - else - q = 0; - - for (i++; str[i]; i++) { - if (q && str[i] == q) - break; - if (!q && (str[i] == ')' || str[i] == '&' || - str[i] == '|')) - break; - } - /* Skip quotes */ - if (q) - s++; + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. 
+ */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); -- cgit v1.2.3 From 6fc2171c5c03672bae71d04a0f5fa88cc9c3b4e2 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 30 Nov 2018 15:56:22 +0100 Subject: tracing: Allow RCU to run between postponed startup tests When building an allmodconfig kernel for arm64 and booting it in qemu, CONFIG_FTRACE_STARTUP_TEST gets enabled and that takes time so the watchdog expires and prints out a message like this: 'watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]' Depending on which test gets called from init_trace_selftests(), it can stay in the loop for minutes. Rework so that cond_resched() gets called in the init_trace_selftests() loop. Link: http://lkml.kernel.org/r/20181130145622.26334-1-anders.roxell@linaro.org Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Anders Roxell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3259019cc66d..4269af5905e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1722,6 +1722,10 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. + */ + cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { -- cgit v1.2.3 From cbe08bcbbe787315c425dde284dcb715cfbf3f39 Mon Sep 17 00:00:00 2001 From: Elazar Leibovich Date: Mon, 31 Dec 2018 13:58:37 +0200 Subject: tracing: Fix partial reading of trace event's id file When reading only part of the id file, the ppos isn't tracked correctly. This is taken care of by simple_read_from_buffer(). Reading a single byte, and then the next byte, would result in EOF. While this may not seem like a big deal, it breaks abstractions that read information from files unbuffered. See for example https://github.com/golang/go/issues/29399 This code was mentioned as problematic in commit cd458ba9d5a5 ("tracing: Do not (ab)use trace_seq in event_id_read()") An example C program that shows this bug is: #include <stdio.h> #include <stdlib.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> int main(int argc, char **argv) { if (argc < 2) return 1; int fd = open(argv[1], O_RDONLY); char c; read(fd, &c, 1); printf("First %c\n", c); read(fd, &c, 1); printf("Second %c\n", c); } Then run with, e.g. sudo ./a.out /sys/kernel/debug/tracing/events/tcp/tcp_set_state/id You'll notice you're getting the first character twice, instead of the first two characters in the id file.
Link: http://lkml.kernel.org/r/20181231115837.4932-1-elazar@lightbitslabs.com Cc: Orit Wasserman Cc: Oleg Nesterov Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 23725aeeab10b ("ftrace: provide an id file for each event") Signed-off-by: Elazar Leibovich Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 81c038ed6cee..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1319,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len; - if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV; -- cgit v1.2.3 From 8623b00676f16ed8972008095deca2c8e2b97a37 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 14 Jan 2019 22:34:08 -0600 Subject: tracing: Replace kzalloc with kcalloc Replace kzalloc() function with its 2-factor argument form, kcalloc(). This patch replaces cases of: kzalloc(a * b, gfp) with: kcalloc(a, b, gfp) This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190115043408.GA23456@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0d1d5353464..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -558,7 +558,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, parg->count); } - code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; @@ -637,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op = FETCH_OP_END; /* Shrink down the code buffer */ - parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) ret = -ENOMEM; else -- cgit v1.2.3 From b9416997603ef7e17d4de10b6408f19da2feb72c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Mon, 28 Jan 2019 17:55:53 -0800 Subject: tracing: Fix documentation about disabling options using trace_options To disable a tracing option using the trace_options file, the option name needs to be prefixed with 'no', and not suffixed, as the README states. Fix it. Link: http://lkml.kernel.org/r/154872690031.47356.5739053380942044586.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. 
Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4269af5905e4..a3a6945a7732 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4755,7 +4755,7 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From fdc6bae940ee9eb869e493990540098b8c0fd6ab Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Wed, 17 Apr 2019 10:48:33 +0200 Subject: ntp: Allow TAI-UTC offset to be set to zero The ADJ_TAI adjtimex mode sets the TAI-UTC offset of the system clock. It is typically set by NTP/PTP implementations and it is automatically updated by the kernel on leap seconds. The initial value is zero (which applications may interpret as unknown), but this value cannot be set by adjtimex. This limitation seems to go back to the original "nanokernel" implementation by David Mills. Change the ADJ_TAI check to accept zero as a valid TAI-UTC offset in order to allow setting it back to the initial value. Fixes: 153b5d054ac2 ("ntp: support for TAI") Suggested-by: Ondrej Mosnacek Signed-off-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Richard Cochran Cc: Prarit Bhargava Link: https://lkml.kernel.org/r/20190417084833.7401-1-mlichvar@redhat.com --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..f43d47c8c3b6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -690,7 +690,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant > 0) + if (txc->modes & ADJ_TAI && txc->constant >= 0) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) -- cgit v1.2.3 From f2b31bb598248c04721cb8485e6091a9feb045ac Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 8 May 2019 13:34:20 -0700 Subject: cgroup: never call do_group_exit() with task->frozen bit set I've got two independent reports that cgroup_task_frozen() check in cgroup_exit() has been triggered by lkp libhugetlbfs-test and LTP ptrace01 tests. 
For example: [ 44.576072] WARNING: CPU: 1 PID: 3028 at kernel/cgroup/cgroup.c:5932 cgroup_exit+0x148/0x160 [ 44.577724] Modules linked in: crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel sr_mod cdrom bochs_drm sg ttm ata_generic pata_acpi ppdev drm_kms_helper snd_pcm syscopyarea aesni_intel snd_timer sysfillrect sysimgblt snd crypto_simd cryptd glue_helper soundcore fb_sys_fops joydev drm serio_raw pcspkr ata_piix libata i2c_piix4 floppy parport_pc parport ip_tables [ 44.583106] CPU: 1 PID: 3028 Comm: ptrace-write-hu Not tainted 5.1.0-rc3-00053-g9262503 #5 [ 44.584600] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 44.586116] RIP: 0010:cgroup_exit+0x148/0x160 [ 44.587135] Code: 0f 84 50 ff ff ff 48 8b 85 c8 0c 00 00 48 8b 78 70 e8 ec 2e 00 00 e9 3b ff ff ff f0 ff 43 60 0f 88 72 21 89 00 e9 48 ff ff ff <0f> 0b e9 1b ff ff ff e8 3c 73 f4 ff 66 90 66 2e 0f 1f 84 00 00 00 [ 44.590113] RSP: 0018:ffffb25702dcfd30 EFLAGS: 00010002 [ 44.591167] RAX: ffff96a7fee32410 RBX: ffff96a7ff1d6000 RCX: dead000000000200 [ 44.592446] RDX: ffff96a7ff1d6080 RSI: ffff96a7fec75290 RDI: ffff96a7fec75290 [ 44.593715] RBP: ffff96a7fec745c0 R08: ffff96a7fec74658 R09: 0000000000000000 [ 44.594985] R10: 0000000000000000 R11: 0000000000000001 R12: ffff96a7fec75101 [ 44.596266] R13: ffff96a7fec745c0 R14: ffff96a7ff3bde30 R15: ffff96a7fec75130 [ 44.597550] FS: 0000000000000000(0000) GS:ffff96a7dd700000(0000) knlGS:0000000000000000 [ 44.598950] CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 [ 44.600098] CR2: 00000000f7a00000 CR3: 000000000d20e000 CR4: 00000000000406e0 [ 44.601417] Call Trace: [ 44.602777] do_exit+0x337/0xc40 [ 44.603677] do_group_exit+0x3a/0xa0 [ 44.604610] get_signal+0x12e/0x8d0 [ 44.605533] ? __switch_to_asm+0x40/0x70 [ 44.606503] do_signal+0x36/0x650 [ 44.607409] ? __switch_to_asm+0x40/0x70 [ 44.608383] ? __schedule+0x267/0x860 [ 44.609329] exit_to_usermode_loop+0x89/0xf0 [ 44.610349] do_fast_syscall_32+0x251/0x2e3 [ 44.611357] entry_SYSENTER_compat+0x7f/0x91 [ 44.612376] ---[ end trace e4ca5cfc4b7f7964 ]--- The problem is caused by the ptrace_signal() call in the for loop in get_signal(). There is a cgroup_enter_frozen() call inside ptrace_signal(), so after exit from ptrace_signal() the task->frozen bit might be set. In this case do_group_exit() can be called with the task->frozen bit set and trigger the warning. This is only place where we can leave the loop with the task->frozen bit set and without setting JOBCTL_TRAP_FREEZE and TIF_SIGPENDING. To resolve this problem, let's move cgroup_leave_frozen(true) call to just after the fatal label. If the task is going to die, the frozen bit must be cleared no matter how we get into this point. 
Reported-by: kernel test robot Reported-by: Qian Cai Cc: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/signal.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 16b72f4f14df..8607b11ff936 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2483,10 +2483,6 @@ relock: ksig->info.si_signo = signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); recalc_sigpending(); - current->jobctl &= ~JOBCTL_TRAP_FREEZE; - spin_unlock_irq(&sighand->siglock); - if (unlikely(cgroup_task_frozen(current))) - cgroup_leave_frozen(true); goto fatal; } @@ -2608,8 +2604,10 @@ relock: continue; } - spin_unlock_irq(&sighand->siglock); fatal: + spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. -- cgit v1.2.3 From c3b7112df86b769927a60a6d7175988ca3d60f09 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 10 May 2019 11:53:46 +0200 Subject: fork: do not release lock that wasn't taken Avoid calling cgroup_threadgroup_change_end() without having called cgroup_threadgroup_change_begin() first. During process creation we need to check whether the cgroup we are in allows us to fork. To perform this check the cgroup needs to guard itself against threadgroup changes and takes a lock. Prior to CLONE_PIDFD the cleanup target "bad_fork_free_pid" would also need to call cgroup_threadgroup_change_end() because said lock had already been taken. However, this is not the case anymore with the addition of CLONE_PIDFD. We are now allocating a pidfd before we check whether the cgroup we're in can fork and thus prior to taking the lock. So when copy_process() fails at the right step it would release a lock we haven't taken. This bug is not even very subtle to be honest. It's just not very clear from the naming of cgroup_threadgroup_change_{begin,end}() that a lock is taken. Here's the relevant splat: entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fec849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffed5a8c EFLAGS: 00000246 ORIG_RAX: 0000000000000078 RAX: ffffffffffffffda RBX: 0000000000003ffc RCX: 0000000000000000 RDX: 00000000200005c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(depth <= 0) WARNING: CPU: 1 PID: 7744 at kernel/locking/lockdep.c:4052 __lock_release kernel/locking/lockdep.c:4052 [inline] WARNING: CPU: 1 PID: 7744 at kernel/locking/lockdep.c:4052 lock_release+0x667/0xa00 kernel/locking/lockdep.c:4321 Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 7744 Comm: syz-executor007 Not tainted 5.1.0+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x45 kernel/panic.c:566 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:179 [inline] fixup_bug arch/x86/kernel/traps.c:174 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:972 RIP: 0010:__lock_release kernel/locking/lockdep.c:4052 [inline] RIP: 0010:lock_release+0x667/0xa00 kernel/locking/lockdep.c:4321 Code: 0f 85 a0 03 00 00 8b 35 77 66 08 08 85 f6 75 23 48 c7 c6 a0 55 6b 87 48 c7 c7 40 25 6b 87 4c 89 85 70 ff ff ff e8 b7 a9 eb ff <0f> 0b 4c 8b 85 70 ff ff ff 4c 89 ea 4c 89 e6 4c 89 c7 e8 52 63 ff RSP: 0018:ffff888094117b48 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 1ffff11012822f6f RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff815af236 RDI: ffffed1012822f5b RBP: ffff888094117c00 R08: ffff888092bfc400 R09: fffffbfff113301d R10: fffffbfff113301c R11: ffffffff889980e3 R12: ffffffff8a451df8 R13: ffffffff8142e71f R14: ffffffff8a44cc80 R15: ffff888094117bd8 percpu_up_read.constprop.0+0xcb/0x110 include/linux/percpu-rwsem.h:92 cgroup_threadgroup_change_end include/linux/cgroup-defs.h:712 [inline] copy_process.part.0+0x47ff/0x6710 kernel/fork.c:2222 copy_process kernel/fork.c:1772 [inline] _do_fork+0x25d/0xfd0 kernel/fork.c:2338 __do_compat_sys_x86_clone arch/x86/ia32/sys_ia32.c:240 [inline] __se_compat_sys_x86_clone arch/x86/ia32/sys_ia32.c:236 [inline] __ia32_compat_sys_x86_clone+0xbc/0x140 arch/x86/ia32/sys_ia32.c:236 do_syscall_32_irqs_on arch/x86/entry/common.c:334 [inline] do_fast_syscall_32+0x281/0xd54 arch/x86/entry/common.c:405 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fec849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffed5a8c EFLAGS: 00000246 ORIG_RAX: 0000000000000078 RAX: ffffffffffffffda RBX: 0000000000003ffc RCX: 0000000000000000 RDX: 00000000200005c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Kernel Offset: disabled Rebooting in 86400 seconds.. 
Reported-and-tested-by: syzbot+3286e58549edc479faae@syzkaller.appspotmail.com Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Christian Brauner --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5359facf9867..737db1828437 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2102,7 +2102,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_put_pidfd; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -2217,11 +2217,12 @@ bad_fork_cancel_cgroup: spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_cgroup_threadgroup_change_end: + cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) ksys_close(pidfd); bad_fork_free_pid: - cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: -- cgit v1.2.3 From 56e33afd7757d5da2664fb797f2544ce167176be Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 10 May 2019 23:47:50 +0200 Subject: livepatch: Remove klp_check_compiler_support() The only purpose of klp_check_compiler_support() is to make sure that we are not using ftrace on x86 via mcount (because that's executed only after prologue has already happened, and that's too late for livepatching purposes). Now that mcount is not supported by ftrace any more, there is no need for klp_check_compiler_support() either. Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1905102346100.17054@cbobk.fhfr.pm Reported-by: Linus Torvalds Signed-off-by: Jiri Kosina Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..112a36ed4a09 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1220,14 +1220,6 @@ void klp_module_going(struct module *mod) static int __init klp_init(void) { - int ret; - - ret = klp_check_compiler_support(); - if (ret) { - pr_info("Your compiler is too old; turning off.\n"); - return -EINVAL; - } - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); if (!klp_root_kobj) return -ENOMEM; -- cgit v1.2.3 From af959b18fd447170a10865283ba691af4353cc7f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 11 May 2019 03:03:09 +0200 Subject: bpf: fix out of bounds backwards jmps due to dead code removal systemtap folks reported the following splat recently: [ 7790.862212] WARNING: CPU: 3 PID: 26759 at arch/x86/kernel/kprobes/core.c:1022 kprobe_fault_handler+0xec/0xf0 [...] [ 7790.864113] CPU: 3 PID: 26759 Comm: sshd Not tainted 5.1.0-0.rc7.git1.1.fc31.x86_64 #1 [ 7790.864198] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS[...] [ 7790.864314] RIP: 0010:kprobe_fault_handler+0xec/0xf0 [ 7790.864375] Code: 48 8b 50 [...]
[ 7790.864714] RSP: 0018:ffffc06800bdbb48 EFLAGS: 00010082 [ 7790.864812] RAX: ffff9e2b75a16320 RBX: 0000000000000000 RCX: 0000000000000000 [ 7790.865306] RDX: ffffffffffffffff RSI: 000000000000000e RDI: ffffc06800bdbbf8 [ 7790.865514] RBP: ffffc06800bdbbf8 R08: 0000000000000000 R09: 0000000000000000 [ 7790.865960] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc06800bdbbf8 [ 7790.866037] R13: ffff9e2ab56a0418 R14: ffff9e2b6d0bb400 R15: ffff9e2b6d268000 [ 7790.866114] FS: 00007fde49937d80(0000) GS:ffff9e2b75a00000(0000) knlGS:0000000000000000 [ 7790.866193] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7790.866318] CR2: 0000000000000000 CR3: 000000012f312000 CR4: 00000000000006e0 [ 7790.866419] Call Trace: [ 7790.866677] do_user_addr_fault+0x64/0x480 [ 7790.867513] do_page_fault+0x33/0x210 [ 7790.868002] async_page_fault+0x1e/0x30 [ 7790.868071] RIP: 0010: (null) [ 7790.868144] Code: Bad RIP value. [ 7790.868229] RSP: 0018:ffffc06800bdbca8 EFLAGS: 00010282 [ 7790.868362] RAX: ffff9e2b598b60f8 RBX: ffffc06800bdbe48 RCX: 0000000000000004 [ 7790.868629] RDX: 0000000000000004 RSI: ffffc06800bdbc6c RDI: ffff9e2b598b60f0 [ 7790.868834] RBP: ffffc06800bdbcf8 R08: 0000000000000000 R09: 0000000000000004 [ 7790.870432] R10: 00000000ff6f7a03 R11: 0000000000000000 R12: 0000000000000001 [ 7790.871859] R13: ffffc06800bdbcb8 R14: 0000000000000000 R15: ffff9e2acd0a5310 [ 7790.873455] ? vfs_read+0x5/0x170 [ 7790.874639] ? vfs_read+0x1/0x170 [ 7790.875834] ? trace_call_bpf+0xf6/0x260 [ 7790.877044] ? vfs_read+0x1/0x170 [ 7790.878208] ? vfs_read+0x5/0x170 [ 7790.879345] ? kprobe_perf_func+0x233/0x260 [ 7790.880503] ? vfs_read+0x1/0x170 [ 7790.881632] ? vfs_read+0x5/0x170 [ 7790.882751] ? kprobe_ftrace_handler+0x92/0xf0 [ 7790.883926] ? __vfs_read+0x30/0x30 [ 7790.885050] ? ftrace_ops_assist_func+0x94/0x100 [ 7790.886183] ? vfs_read+0x1/0x170 [ 7790.887283] ? vfs_read+0x5/0x170 [ 7790.888348] ? ksys_read+0x5a/0xe0 [ 7790.889389] ? do_syscall_64+0x5c/0xa0 [ 7790.890401] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe After some debugging, turns out that the logic in 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") has a bug that is exposed after 52875a04f4b2 ("bpf: verifier: remove dead code") in that we miss some of the jump offset adjustments after code patching when we remove dead code, more concretely, upon backward jump spanning over the area that is being removed. BPF insns of a case that was hit pre 52875a04f4b2: [...] 676: (85) call bpf_perf_event_output#-47616 677: (05) goto pc-636 678: (62) *(u32 *)(r10 -64) = 0 679: (bf) r7 = r10 680: (07) r7 += -64 681: (05) goto pc-44 682: (05) goto pc-1 683: (05) goto pc-1 BPF insns afterwards: [...] 618: (85) call bpf_perf_event_output#-47616 619: (05) goto pc-638 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 To illustrate the bug, situation looks as follows: ____ 0 | | <-- foo: [...] 1 |____| 2 |____| <-- pos / end_new ^ 3 | | | 4 | | | len 5 |____| | (remove region) 6 | | <-- end_old v 7 | | 8 | | <-- curr (jmp foo) 9 |____| The condition curr >= end_new && curr + off + 1 < end_new in the branch delta adjustments is never hit because curr + off + 1 < end_new is compared as unsigned and therefore curr + off + 1 > end_new in unsigned realm as curr + off + 1 becomes negative since the insns are memmove()'d before the offset adjustments. Correct BPF insns after this fix: [...] 
618: (85) call bpf_perf_event_output#-47216 619: (05) goto pc-578 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 Note that unprivileged case is not affected from this. Fixes: 52875a04f4b2 ("bpf: verifier: remove dead code") Fixes: 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") Reported-by: Frank Ch. Eigler Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3ba56e73c90e..242a643af82f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -338,7 +338,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; @@ -356,7 +356,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; -- cgit v1.2.3 From ecebc5ce59a003163eb608ace38a01d7ffeb0a95 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 22 Mar 2019 18:52:27 -0700 Subject: kdb: Get rid of broken attempt to print CCVERSION in kdb summary If you drop into kdb and type "summary", it prints out a line that says this: ccversion CCVERSION ...and I don't mean that it actually prints out the version of the C compiler. It literally prints out the string "CCVERSION". The version of the C Compiler is already printed at boot up and it doesn't seem useful to replicate this in kdb. Let's just delete it. We can also delete the bit of the Makefile that called the C compiler in an attempt to pass this into kdb. This will remove one extra call to the C compiler at Makefile parse time and (very slightly) speed up builds. Signed-off-by: Douglas Anderson Reviewed-by: Masahiro Yamada Signed-off-by: Daniel Thompson --- kernel/debug/kdb/Makefile | 1 - kernel/debug/kdb/kdb_main.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -6,7 +6,6 @@ # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
# -CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..fc96dbf8d9de 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("machine %s\n", init_uts_ns.name.machine); kdb_printf("nodename %s\n", init_uts_ns.name.nodename); kdb_printf("domainname %s\n", init_uts_ns.name.domainname); - kdb_printf("ccversion %s\n", __stringify(CCVERSION)); now = __ktime_get_real_seconds(); time64_to_tm(now, 0, &tm); -- cgit v1.2.3 From b586627e10f57ee3aa8f0cfab0d6f7dc4ae63760 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 May 2019 15:50:18 +0300 Subject: kdb: do a sanity check on the cpu in kdb_per_cpu() The "whichcpu" comes from argv[3]. The cpu_online() macro looks up the cpu in a bitmap of online cpus, but if the value is too high then it could read beyond the end of the bitmap and possibly Oops. Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") Signed-off-by: Dan Carpenter Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index fc96dbf8d9de..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2583,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv) diag = kdbgetularg(argv[3], &whichcpu); if (diag) return diag; - if (!cpu_online(whichcpu)) { + if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { kdb_printf("cpu %ld is not online\n", whichcpu); return KDB_BADCPUNUM; } -- cgit v1.2.3 From e2f7fc0ac6957cabff4cecf6c721979b571af208 Mon Sep 17 00:00:00 2001 From: Krzesimir Nowak Date: Wed, 8 May 2019 18:08:58 +0200 Subject: bpf: fix undefined behavior in narrow load handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") made the verifier add AND instructions to clear the unwanted bits with a mask when doing a narrow load. The mask is computed with (1 << size * 8) - 1 where "size" is the size of the narrow load. When doing a 4 byte load of an 8 byte field the verifier shifts the literal 1 by 32 places to the left. This results in an overflow of a signed integer, which is undefined behavior. Typically, the computed mask was zero, so the result of the narrow load ended up being zero too. Cast the literal to long long to avoid overflows. Note that a narrow load of a 4 byte field does not have the undefined behavior, because the load size can only be either 1 or 2 bytes, so shifting 1 by 8 or 16 places will not overflow it. And reading 4 bytes would not be a narrow load of a 4 byte field.
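The overflow is easy to reproduce in a standalone program (a sketch that paraphrases the mask computation; it is not the verifier code itself):

#include <stdio.h>

int main(void)
{
	int size = 4; /* 4-byte narrow load of an 8-byte context field */

	/* Undefined: 1 is a 32-bit int, so 1 << 32 overflows. In
	 * practice the mask often ends up as 0, zeroing the load. */
	unsigned long long bad = (1 << size * 8) - 1;

	/* Defined: the literal is widened to 64 bits first. */
	unsigned long long good = (1ULL << size * 8) - 1;

	printf("bad:  %#llx\ngood: %#llx\n", bad, good);
	return 0;
}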
Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Reviewed-by: Alban Crequy Reviewed-by: Iago López Galeiras Signed-off-by: Krzesimir Nowak Cc: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b05e8938d5c..95f9354495ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7599,7 +7599,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } } -- cgit v1.2.3 From 2baae3545327632167c0180e9ca1d467416f1919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 May 2019 09:59:16 -0700 Subject: bpf: devmap: fix use-after-free Read in __dev_map_entry_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit synchronize_rcu() is fine when the rcu callbacks only need to free memory (kfree_rcu() or direct kfree() call rcu call backs) __dev_map_entry_free() is a bit more complex, so we need to make sure that call queued __dev_map_entry_free() callbacks have completed. sysbot report: BUG: KASAN: use-after-free in dev_map_flush_old kernel/bpf/devmap.c:365 [inline] BUG: KASAN: use-after-free in __dev_map_entry_free+0x2a8/0x300 kernel/bpf/devmap.c:379 Read of size 8 at addr ffff8801b8da38c8 by task ksoftirqd/1/18 CPU: 1 PID: 18 Comm: ksoftirqd/1 Not tainted 4.17.0+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 dev_map_flush_old kernel/bpf/devmap.c:365 [inline] __dev_map_entry_free+0x2a8/0x300 kernel/bpf/devmap.c:379 __rcu_reclaim kernel/rcu/rcu.h:178 [inline] rcu_do_batch kernel/rcu/tree.c:2558 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2818 [inline] __rcu_process_callbacks kernel/rcu/tree.c:2785 [inline] rcu_process_callbacks+0xe9d/0x1760 kernel/rcu/tree.c:2802 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:284 run_ksoftirqd+0x86/0x100 kernel/softirq.c:645 smpboot_thread_fn+0x417/0x870 kernel/smpboot.c:164 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 Allocated by task 6675: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:513 [inline] kzalloc include/linux/slab.h:706 [inline] dev_map_alloc+0x208/0x7f0 kernel/bpf/devmap.c:102 find_and_alloc_map kernel/bpf/syscall.c:129 [inline] map_create+0x393/0x1010 kernel/bpf/syscall.c:453 __do_sys_bpf kernel/bpf/syscall.c:2351 [inline] __se_sys_bpf kernel/bpf/syscall.c:2328 [inline] __x64_sys_bpf+0x303/0x510 kernel/bpf/syscall.c:2328 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 26: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 dev_map_free+0x4fa/0x670 
kernel/bpf/devmap.c:191 bpf_map_free_deferred+0xba/0xf0 kernel/bpf/syscall.c:262 process_one_work+0xc64/0x1b70 kernel/workqueue.c:2153 worker_thread+0x181/0x13a0 kernel/workqueue.c:2296 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 The buggy address belongs to the object at ffff8801b8da37c0 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 264 bytes inside of 512-byte region [ffff8801b8da37c0, ffff8801b8da39c0) The buggy address belongs to the page: page:ffffea0006e368c0 count:1 mapcount:0 mapping:ffff8801da800940 index:0xffff8801b8da3540 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0007217b88 ffffea0006e30cc8 ffff8801da800940 raw: ffff8801b8da3540 ffff8801b8da3040 0000000100000004 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801b8da3780: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8801b8da3800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > ffff8801b8da3880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801b8da3900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b8da3980: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Eric Dumazet Reported-by: syzbot+457d3e2ffbcf31aee5c0@syzkaller.appspotmail.com Acked-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 191b79948424..1e525d70f833 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -164,6 +164,9 @@ static void dev_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); + /* Make sure prior __dev_map_entry_free() have completed. */ + rcu_barrier(); + /* To ensure all pending flush operations have completed wait for flush * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected -- cgit v1.2.3 From 390e99cfdda1334f45c718cc02cd26eb3135f233 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 13 May 2019 12:04:36 -0700 Subject: bpf: mark bpf_event_notify and bpf_event_init as static Both of them are not declared in the headers and not used outside of bpf_trace.c file. 
Fixes: a38d1107f937c ("bpf: support raw tracepoints in modules") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b496ffdf5f36..f92d6ad5e080 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1297,7 +1297,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, } #ifdef CONFIG_MODULES -int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +static int bpf_event_notify(struct notifier_block *nb, unsigned long op, + void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; @@ -1336,7 +1337,7 @@ static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; -int __init bpf_event_init(void) +static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; -- cgit v1.2.3 From ca976bfb3154c7bc67c4651ecd144fdf67ccaee7 Mon Sep 17 00:00:00 2001 From: Wenlin Kang Date: Mon, 13 May 2019 16:57:20 +0800 Subject: kdb: Fix bound check compiler warning The strncpy() function may leave the destination string buffer unterminated; better to use strscpy() instead. This fixes the following warning with gcc 8.2: kernel/debug/kdb/kdb_io.c: In function 'kdb_getstr': kernel/debug/kdb/kdb_io.c:449:3: warning: 'strncpy' specified bound 256 equals destination size [-Wstringop-truncation] strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Wenlin Kang Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -446,7 +446,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); kdb_printf(kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); -- cgit v1.2.3 From a9e73998f9d705c94a8dca9687633adc0f24a19a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 13 May 2019 17:15:40 -0700 Subject: kernel/sys.c: prctl: fix false positive in validate_prctl_map() While validating a new map we require the @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set from executable loaders such as the ELF handlers; still, it is perfectly valid to have a loadable data section with zero size in the file, in such a case start_data is equal to end_data once the kernel loader finishes. As a result, when we're trying to restore such programs, the procedure fails and the kernel returns -EINVAL. From the image dump of a program: | "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0", Thus we need to change validate_prctl_map() to use the less-or-equal operator instead of strictly-less.
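With the values from the image dump above, the effect of relaxing the comparison can be checked in a few lines (a standalone model of the check, not the kernel macro):

#include <stdio.h>

/* Standalone model of the ordering check: returns 0 on success,
 * -22 (-EINVAL) on failure, mirroring the kernel's error code. */
static int check_order(unsigned long a, unsigned long b, int strict)
{
	return (strict ? a < b : a <= b) ? 0 : -22;
}

int main(void)
{
	/* values from the image dump in the commit message */
	unsigned long start_data = 0xf1bfb0, end_data = 0xf1bfb0;

	printf("strict '<':   %d\n", check_order(start_data, end_data, 1));
	printf("relaxed '<=': %d\n", check_order(start_data, end_data, 0));
	return 0;
}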
Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov Cc: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); -- cgit v1.2.3 From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 13 May 2019 17:16:41 -0700 Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd Userfaultfd can be misused to make it easier to exploit existing use-after-free (and similar) bugs that might otherwise only make a short window or race condition available. By using userfaultfd to stall a kernel thread, a malicious program can keep some state that it wrote, stable for an extended period, which it can then access using an existing exploit. While it doesn't cause the exploit itself, and while it's not the only thing that can stall a kernel thread when accessing a memory location, it's one of the few that never needs privilege. We can add a flag, allowing userfaultfd to be restricted, so that in general it won't be usable by arbitrary user programs, but in environments that require userfaultfd it can be turned back on. Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users. When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Andrea said: : The only difference between the bpf sysctl and the userfaultfd sysctl : this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability : requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement, : because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE : already if it's doing other kind of tracking on processes runtime, in : addition of userfaultfd. In other words both syscalls works only for : root, when the two sysctl are opt-in set to 1. [dgilbert@redhat.com: changelog additions] [akpm@linux-foundation.org: documentation tweak, per Mike] Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Paolo Bonzini Cc: Hugh Dickins Cc: Luis Chamberlain Cc: Maxime Coquelin Cc: Maya Gokhale Cc: Jerome Glisse Cc: Pavel Emelyanov Cc: Johannes Weiner Cc: Martin Cracauer Cc: Denis Plotnikov Cc: Marty McFadden Cc: Mike Kravetz Cc: Kees Cook Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: "Dr .
David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. -- cgit v1.2.3 From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. 
This patchset does the initial mechanical conversion of all the places that call mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event, as well as the vma if it is known (most invalidations happen against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. MMU_NOTIFY_UNMAP is always the safe default, as users of mmu notifier should assume that everything in the range is going away when that event happens. A later patch converts the mm call paths to use more appropriate events for each call. This is done as 2 patches so that no call site is forgotten, especially as it uses the following coccinelle patch:

%<----------------------------------------------------------------------
@@
identifier I1, I2, I3, I4;
@@
static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1,
+enum mmu_notifier_event event,
+unsigned flags,
+struct vm_area_struct *vma,
struct mm_struct *I2, unsigned long I3, unsigned long I4)
{ ... }

@@
@@
-#define mmu_notifier_range_init(range, mm, start, end)
+#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end)

@@
expression E1, E3, E4;
identifier I1;
@@
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, I1,
I1->vm_mm, E3, E4)
...>

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(..., struct vm_area_struct *VMA, ...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...>
}

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(...) {
struct vm_area_struct *VMA;
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...>
}

@@
expression E1, E2, E3, E4;
identifier FN;
@@
FN(...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, NULL,
E2, E3, E4)
...>
}
---------------------------------------------------------------------->%

Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit v1.2.3 From 7269f999934b289da7972e975b781417b07ef836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Mon, 13 May 2019 17:20:53 -0700 Subject: mm/mmu_notifier: use correct mmu_notifier events for each invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates each existing
invalidation to use the correct mmu notifier event that represents what is happening to the CPU page table. See the patch which introduced the events for the rationale behind this. Link: http://lkml.kernel.org/r/20190326164747.24405-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e34b699f3865..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit v1.2.3 From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where we allocate the memmap from, by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which select additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and an altmap for the alternative memmap allocator. This patch shouldn't introduce any functional change.
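As a sketch of the resulting calling convention (hypothetical caller; MHP_MEMBLOCK_API is the only flag defined at this point), a hotplug path that wants the sysfs memblock API and the default memmap allocation would do roughly:

  struct mhp_restrictions restrictions = {
          .flags  = MHP_MEMBLOCK_API,     /* create sysfs memblock devices */
          .altmap = NULL,                 /* no alternative memmap allocator */
  };

  rc = arch_add_memory(nid, start, size, &restrictions);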
[akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { -- cgit v1.2.3 From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only a few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization, and avoids the need to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov
Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); -- cgit v1.2.3 From 640be2d1ffbc1946f1547eb89b5005ed7542de99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:23:23 -0700 Subject: kernel/memremap.c: remove the unused device_private_entry_fault() export This export has been entirely unused since it was added more than 1 1/2 years ago. Link: http://lkml.kernel.org/r/20190429115535.12793-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4e59d29245f4..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) -- cgit v1.2.3 From c6110222c6f49ea68169f353565eb865488a8619 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 May 2019 01:18:55 +0200 Subject: bpf: add map_lookup_elem_sys_only for lookups from syscall side Add a callback map_lookup_elem_sys_only() that map implementations could use over map_lookup_elem() from system call side in case the map implementation needs to handle the latter differently than from the BPF data path. If map_lookup_elem_sys_only() is set, this will be preferred pick for map lookups out of user space. This hook is used in a follow-up fix for LRU map, but once development window opens, we can convert other map types from map_lookup_elem() (here, the one called upon BPF_MAP_LOOKUP_ELEM cmd is meant) over to use the callback to simplify and clean up the latter. 
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ad3ccf82f31d..cb5440b02e82 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -808,7 +808,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { -- cgit v1.2.3 From 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 May 2019 01:18:56 +0200 Subject: bpf, lru: avoid messing with eviction heuristics upon syscall lookup One of the biggest issues we face right now with picking the LRU map over a regular hash table is that a map walk out of user space, for example, to just dump the existing entries or to remove certain ones, will completely mess up LRU eviction heuristics and wrong entries such as just created ones will get evicted instead. The reason for this is that we mark an entry as "in use" via bpf_lru_node_set_ref() from the system call lookup side as well. Thus upon walk, all entries are being marked, so information about the actual least recently used ones is "lost". In the case of Cilium, where it can be used (among other things) as a BPF based connection tracker, this current behavior causes disruption upon control plane changes that need to walk the map from user space to evict certain entries. The discussion result from bpfconf [0] was that we should simply just remove marking from the system call side as no good use case could be found where it's actually needed there. Therefore this patch removes marking for the regular LRU and per-CPU flavor. If there should ever be a need in the future, the behavior could be selected via a map creation flag, but due to the mentioned reason we avoid this here.
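For context, the kind of user-space map walk that used to disturb the heuristics looks roughly like this (libbpf calls; error handling trimmed). After this patch, the syscall-side lookups below no longer mark entries as recently used:

  #include <stdio.h>
  #include <bpf/bpf.h>

  static void dump_lru_map(int map_fd)
  {
          __u32 key, next_key, value;
          void *prev = NULL;

          while (bpf_map_get_next_key(map_fd, prev, &next_key) == 0) {
                  /* previously, each lookup here set the LRU reference bit */
                  if (bpf_map_lookup_elem(map_fd, &next_key, &value) == 0)
                          printf("key %u -> value %u\n", next_key, value);
                  key = next_key;
                  prev = &key;
          }
  }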
[0] http://vger.kernel.org/bpfconf.html Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH") Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 192d32e77db3..0f2708fde5f7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -527,18 +527,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, + void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { - bpf_lru_node_set_ref(&l->lru_node); + if (mark) + bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, true); +} + +static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, false); +} + static u32 htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -1250,6 +1262,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, @@ -1281,7 +1294,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -1297,8 +1309,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; - if (htab_is_lru(htab)) - bpf_lru_node_set_ref(&l->lru_node); + /* We do not mark LRU map element here in order to not mess up + * eviction heuristics when user space does a map walk. + */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, -- cgit v1.2.3 From acb2ec3dd003b50b6fb5772057a08ec0dc45d42a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 May 2019 15:40:43 -0700 Subject: kernel/Makefile: don't assume that kernel/gen_ikh_data.sh is executable If the user downloads and applies patch-5.1.gz using patch(1), the x bit on kernel/gen_ikh_data.sh is not set. /bin/sh: 1: ./kernel/gen_ikh_data.sh: Permission denied Fix this by using CONFIG_SHELL. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 298437bb2c6a..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) -- cgit v1.2.3 From c3f3ce049f7d97cc7ec9c01cb51d9ec74e0f37c2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:46 -0700 Subject: userfaultfd: use RCU to free the task struct when fork fails The task structure is freed while get_mem_cgroup_from_mm() holds rcu_read_lock() and dereferences mm->owner. get_mem_cgroup_from_mm() failing fork() ---- --- task = mm->owner mm->owner = NULL; free(task) if (task) *task; /* use after free */ The fix consists in freeing the task with RCU also in the fork failure case, exactly like it always happens for the regular exit(2) path. That is enough to make the rcu_read_lock hold in get_mem_cgroup_from_mm() (left side above) effective to avoid a use after free when dereferencing the task structure. An alternate possible fix would be to defer the delivery of the userfaultfd contexts to the monitor until after fork() is guaranteed to succeed. Such a change would require more changes because it would create a strict ordering dependency where the uffd methods would need to be called beyond the last potentially failing branch in order to be safe. This solution as opposed only adds the dependency to common code to set mm->owner to NULL and to free the task struct that was pointed by mm->owner with RCU, if fork ends up failing. The userfaultfd methods can still be called anywhere during the fork runtime and the monitor will keep discarding orphaned "mm" coming from failed forks in userland. This race condition couldn't trigger if CONFIG_MEMCG was set =n at build time. [aarcange@redhat.com: improve changelog, reduce #ifdefs per Michal] Link: http://lkml.kernel.org/r/20190429035752.4508-1-aarcange@redhat.com Link: http://lkml.kernel.org/r/20190325225636.11635-2-aarcange@redhat.com Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event") Signed-off-by: Andrea Arcangeli Tested-by: zhong jiang Reported-by: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Oleg Nesterov Cc: Jann Horn Cc: Hugh Dickins Cc: Mike Rapoport Cc: Mike Kravetz Cc: Peter Xu Cc: Jason Gunthorpe Cc: "Kirill A . 
Shutemov" Cc: Michal Hocko Cc: zhong jiang Cc: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..b409e792aadc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1343,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1726,6 +1736,21 @@ static int pidfd_create(struct pid *pid) return fd; } +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2233,8 +2258,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2265,7 +2292,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); -- cgit v1.2.3 From 987717e5e016a0dd3011d3bb16546672713f94e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:50 -0700 Subject: mm: change mm_update_next_owner() to update mm->owner with WRITE_ONCE The RCU reader uses rcu_dereference() inside rcu_read_lock critical sections, so the writer shall use WRITE_ONCE. Just a cleanup, we still rely on gcc to emit atomic writes in other places. Link: http://lkml.kernel.org/r/20190325225636.11635-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Jann Horn Cc: Jason Gunthorpe Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Oleg Nesterov Cc: Peter Xu Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. 
*/ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } -- cgit v1.2.3 From 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:56 -0700 Subject: psi: introduce state_mask to represent stalled psi states Patch series "psi: pressure stall monitors", v6. This is a respin of: https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/ Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high-enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in the userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before the device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x fewer false positives than vmpressure signals. Having the ability to specify multiple triggers for the same psi metric allows other parts of the Android framework to monitor the memory state of the device and act accordingly. The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.:

  /* Signal when stall time exceeds 100ms of a 1s window */
  char trigger[] = "full 100000 1000000";

  fd = open("/proc/pressure/memory", O_RDWR);
  write(fd, trigger, sizeof(trigger));
  while (poll() >= 0) {
          ...
  };
  close(fd);

When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-6 prepare the psi code for polling support. Patch 7 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files. The patches were developed in collaboration with Johannes Weiner. This patch (of 7): The psi monitoring patches will need to determine the same states as record_times(). To avoid calculating them twice, maintain a state mask that can be consulted cheaply. Do this in a separate patch to keep the churn in the main feature patch at a minimum.
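The pattern this patch establishes, condensed from the diff below: the writer recomputes the mask once per task state change, and samplers test bits instead of re-deriving each state from the task counts:

  /* writer side: recompute on task state change */
  for (s = 0; s < NR_PSI_STATES; s++) {
          if (test_state(groupc->tasks, s))
                  state_mask |= (1 << s);
  }
  groupc->state_mask = state_mask;

  /* reader side: a cheap bit test replaces test_state() */
  if (groupc->state_mask & (1 << PSI_MEM_SOME))
          times[PSI_MEM_SOME] += now - state_start;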
This adds 4-byte state_mask member into psi_group_cpu struct which results in its first cacheline-aligned part becoming 52 bytes long. Add explicit values to enumeration element counters that affect psi_group_cpu struct size. Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..22c1505ad290 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -213,17 +213,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) static void get_recent_times(struct psi_group *group, int cpu, u32 *times) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,7 +239,7 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; delta = times[s] - groupc->times_prev[s]; @@ -407,15 +407,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,10 +436,10 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } @@ -448,6 +448,8 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,6 +482,13 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); } -- cgit v1.2.3 From 9289c5e6a78a5a9397df5fa60eb82b105abcfecf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan 
Date: Tue, 14 May 2019 15:40:59 -0700 Subject: psi: make psi_enable static psi_enable is not used outside of psi.c, make it static. Link: http://lkml.kernel.org/r/20190319235619.260832-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 22c1505ad290..281702de9772 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,9 +140,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { -- cgit v1.2.3 From bcc78db64168eb6dede056fed2999f75f7ace309 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:02 -0700 Subject: psi: rename psi fields in preparation for psi trigger addition Rename psi_group structure member fields used for calculating psi totals and averages for clear distinction between them and for trigger-related fields that will be added by "psi: introduce psi monitor". [surenb@google.com: v6] Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 281702de9772..4fb4d9913bc8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -165,7 +165,7 @@ static struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +173,9 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); } void __init psi_init(void) @@ -278,7 +278,7 @@ static bool update_stats(struct psi_group *group) int cpu; int s; - mutex_lock(&group->stat_lock); + mutex_lock(&group->avgs_lock); /* * Collect the per-cpu time buckets and average them into a @@ -319,7 +319,7 @@ static bool update_stats(struct psi_group *group) /* avgX= */ now = sched_clock(); - expires = group->next_update; + expires = group->avg_next_update; if (now < expires) goto out; if (now - expires >= psi_period) @@ -332,14 +332,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. 
*/ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,22 +359,22 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } out: - mutex_unlock(&group->stat_lock); + mutex_unlock(&group->avgs_lock); return nonidle_total; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; bool nonidle; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); /* * If there is task activity, periodically fold the per-cpu @@ -391,8 +391,9 @@ static void psi_update_work(struct work_struct *work) u64 now; now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; + if (group->avg_next_update > now) + delay = nsecs_to_jiffies( + group->avg_next_update - now) + 1; schedule_delayed_work(dwork, delay); } } @@ -546,13 +547,13 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -649,7 +650,7 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); } -- cgit v1.2.3 From 7fc70a3999366560ad1d4f2389a78360300c2c6a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:06 -0700 Subject: psi: split update_stats into parts Split update_stats into collect_percpu_times and update_averages for collect_percpu_times to be reused later inside psi monitor. 
Link: http://lkml.kernel.org/r/20190319235619.260832-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 57 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4fb4d9913bc8..ace5ed97b186 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -269,17 +269,13 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static bool collect_percpu_times(struct psi_group *group) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; int cpu; int s; - mutex_lock(&group->avgs_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -317,11 +313,18 @@ static bool update_stats(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + return nonidle_total; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + /* avgX= */ - now = sched_clock(); expires = group->avg_next_update; - if (now < expires) - goto out; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,7 +335,7 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + avg_next_update = expires + ((1 + missed_periods) * psi_period); period = now - (group->avg_last_update + (missed_periods * psi_period)); group->avg_last_update = now; @@ -362,9 +365,8 @@ static bool update_stats(struct psi_group *group) group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->avgs_lock); - return nonidle_total; + + return avg_next_update; } static void psi_avgs_work(struct work_struct *work) @@ -372,10 +374,16 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; bool nonidle; + u64 now; dwork = to_delayed_work(work); group = container_of(dwork, struct psi_group, avgs_work); + mutex_lock(&group->avgs_lock); + + now = sched_clock(); + + nonidle = collect_percpu_times(group); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,19 +391,15 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. 
*/ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; - - now = sched_clock(); - if (group->avg_next_update > now) - delay = nsecs_to_jiffies( - group->avg_next_update - now) + 1; - schedule_delayed_work(dwork, delay); + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); } + + mutex_unlock(&group->avgs_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -707,11 +711,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; -- cgit v1.2.3 From 333f3017c5a893b000b2b4a3529814ce93fa83d7 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:09 -0700 Subject: psi: track changed states Introduce changed_states parameter into collect_percpu_times to track the states changed since the last update. This will be needed to detect whether polled states activated in the monitor patch. Link: http://lkml.kernel.org/r/20190319235619.260832-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ace5ed97b186..1b99eeffaa25 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -210,7 +210,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); u64 now, state_start; @@ -218,6 +219,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) unsigned int seq; u32 state_mask; + *pchanged_states = 0; + /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); @@ -246,6 +249,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) groupc->times_prev[s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,10 +274,11 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool collect_percpu_times(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; + u32 changed_states = 0; int cpu; int s; @@ -287,8 +293,11 @@ static bool collect_percpu_times(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, times, + &cpu_changed_states); + 
changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -313,7 +322,8 @@ static bool collect_percpu_times(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); - return nonidle_total; + if (pchanged_states) + *pchanged_states = changed_states; } static u64 update_averages(struct psi_group *group, u64 now) @@ -373,6 +383,7 @@ static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; u64 now; @@ -383,7 +394,8 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - nonidle = collect_percpu_times(group); + collect_percpu_times(group, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -719,7 +731,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group); + collect_percpu_times(group, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); -- cgit v1.2.3 From 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:12 -0700 Subject: include/: refactor headers to allow kthread.h inclusion in psi_types.h kthread.h can't be included in psi_types.h because it creates a circular inclusion with kthread.h eventually including psi_types.h and complaining on kthread structures not being defined because they are defined further in the kthread.h. Resolve this by removing psi_types.h inclusion from the headers included from kthread.h. Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 0e94682b73bfa6c44c98af7a26771c9c08c055d5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:15 -0700 Subject: psi: introduce psi monitor Psi monitor aims to provide a low-latency short-term pressure detection mechanism configurable by users. It allows users to monitor psi metrics growth and trigger events whenever a metric raises above user-defined threshold within user-defined time window. Time window and threshold are both expressed in usecs. Multiple psi resources with different thresholds and window sizes can be monitored concurrently. Psi monitors activate when system enters stall state for the monitored psi metric and deactivate upon exit from the stall state. While system is in the stall state psi signal growth is monitored at a rate of 10 times per tracking window. Min window size is 500ms, therefore the min monitoring interval is 50ms. Max window size is 10s with monitoring interval of 1s. 
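For reference, a minimal self-contained user-space consumer of the interface described above (a sketch modeled on the documentation example from this series; error handling trimmed). The event arrives as POLLPRI, and POLLERR indicates that the trigger went away:

  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* signal when memory is fully stalled for 150ms of a 1s window */
          const char trig[] = "full 150000 1000000";
          struct pollfd fds;

          fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
          if (fds.fd < 0 || write(fds.fd, trig, strlen(trig) + 1) < 0) {
                  perror("trigger setup");
                  return 1;
          }
          fds.events = POLLPRI;

          while (1) {
                  if (poll(&fds, 1, -1) < 0) {
                          perror("poll");
                          return 1;
                  }
                  if (fds.revents & POLLERR) {
                          fprintf(stderr, "event source is gone\n");
                          return 1;
                  }
                  if (fds.revents & POLLPRI)
                          printf("memory pressure event triggered\n");
          }
          return 0;
  }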
When activated psi monitor stays active for at least the duration of one tracking window to avoid repeated activations/deactivations when psi signal is bouncing. Notifications to the users are rate-limited to one per tracking window. Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 71 ++++++- kernel/sched/psi.c | 494 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 547 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 327f37c9fdfa..1140357d46f4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { @@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = { .name = "io.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1b99eeffaa25..e88918e0bb6d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner * + * Polling support by Suren Baghdasaryan + * Copyright (c) 2018 Google, Inc. 
+ * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include #include #include +#include #include #include #include +#include +#include +#include #include #include "sched.h" @@ -156,6 +163,11 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; @@ -176,6 +188,17 @@ static void group_init(struct psi_group *group) group->avg_next_update = sched_clock() + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,7 +233,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times, +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); @@ -245,8 +269,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times, if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; if (delta) @@ -274,7 +298,9 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; @@ -295,7 +321,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) u32 nonidle; u32 cpu_changed_states; - get_recent_times(group, cpu, times, + get_recent_times(group, cpu, aggregator, times, &cpu_changed_states); changed_states |= cpu_changed_states; @@ -320,7 +346,8 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); if (pchanged_states) *pchanged_states = changed_states; @@ -352,7 +379,7 @@ static u64 update_averages(struct psi_group *group, u64 now) for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->avg_total[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded 
time deltas can slip into the next period, @@ -394,7 +421,7 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - collect_percpu_times(group, &changed_states); + collect_percpu_times(group, PSI_AVGS, &changed_states); nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu @@ -414,6 +441,187 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. 
+ */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); + + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. 
+ */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; + } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -460,8 +668,8 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; @@ -507,6 +715,8 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -567,7 +777,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } @@ -668,6 +882,8 @@ void psi_cgroup_free(struct cgroup *cgroup) cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -731,7 +947,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group, NULL); + collect_percpu_times(group, PSI_AVGS, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); @@ -743,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? 
"full" : "some", @@ -786,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. 
+ */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t 
psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) -- cgit v1.2.3 From df5ba5be7425e1df296d40c5f37a39d98ec666a2 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Tue, 14 May 2019 15:41:18 -0700 Subject: kernel/sched/psi.c: expose pressure metrics on root cgroup Pressure metrics are already recorded and exposed in procfs for the entire system, but any tool which monitors cgroup pressure has to special case the root cgroup to read from procfs. This patch exposes the already recorded pressure metrics on the root cgroup. Link: http://lkml.kernel.org/r/20190510174938.3361741-1-dschatzberg@fb.com Signed-off-by: Dan Schatzberg Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 18 ++++++++++++------ kernel/sched/psi.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1140357d46f4..217cec4e22c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3540,15 +3540,24 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? 
&psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, @@ -4801,7 +4810,6 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4809,7 +4817,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4817,7 +4824,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e88918e0bb6d..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -173,7 +173,7 @@ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -- cgit v1.2.3 From 831246570d34692e0550da952d0655bdcc985419 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 14 May 2019 15:42:28 -0700 Subject: kernel/notifier.c: double register detection By design, notifiers can be registered only once; a second registration attempt, made by mistake, silently corrupts the notifier list. A few years ago I investigated this problem after a host was power cycled because of notifier list corruption. I prepared this patch, applied it to the OpenVZ kernel, and sent it out, but nobody commented on it. Later it helped us to detect a similar problem in the OpenVZ kernel. Mistakes with notifier registration can happen, for example, during subsystem initialization from different namespaces, or because of a lost unregister in the roll-back path on initialization failures. The proposed check cannot prevent the described problem, but it allows us to detect its cause quickly without coredump analysis. Link: http://lkml.kernel.org/r/04127e71-4782-9bbb-fe5a-7c01e93a99b0@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..bfc95b3e4235 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); -- cgit v1.2.3
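For illustration, a minimal userspace model of why double registration corrupts the chain (hypothetical standalone code mirroring the logic above, not part of the patch): inserting the same node twice makes it point to itself, so every later traversal spins forever.

    #include <stdio.h>

    struct node { struct node *next; int priority; };

    static void chain_register(struct node **nl, struct node *n)
    {
    	while (*nl != NULL) {
    		/* the patch adds its WARN_ONCE() at this point */
    		if (n->priority > (*nl)->priority)
    			break;
    		nl = &(*nl)->next;
    	}
    	n->next = *nl;
    	*nl = n;
    }

    int main(void)
    {
    	struct node a = { .next = NULL, .priority = 0 };
    	struct node *head = NULL;

    	chain_register(&head, &a);
    	chain_register(&head, &a);		/* the mistake */
    	printf("self-loop: %d\n", a.next == &a);	/* prints 1 */
    	return 0;
    }

The second call walks past the already-linked node and relinks it behind itself, which is exactly the corruption the new WARN_ONCE() flags before it happens.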
From 0cc75888dad112395df0ac755915ceb79811831c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:31 -0700 Subject: kernel/latencytop.c: remove unnecessary checks for latencytop_enabled 1. In the latencytop source, the only call chain is: account_scheduler_latency(struct task_struct *task, int usecs, int inter) { if (unlikely(latencytop_enabled)) /* the outermost check */ __account_scheduler_latency(task, usecs, inter); } __account_scheduler_latency account_global_scheduler_latency if (!latencytop_enabled) So the inner check for latencytop_enabled is not necessary at all. 2. In clear_all_latency_tracing (now renamed to clear_tsk_latency_tracing), the check for latencytop_enabled is redundant and, to some extent, buggy. There is no reason to refuse to clear /proc/$pid/latency when latencytop_enabled is set to 0: if we disable latencytop manually via echo 0 > /proc/sys/kernel/latencytop and then try to clear /proc/$pid/latency, the clearing fails. Its sibling function clear_global_latency_tracing also has no such check. Notes: These changes are only visible to users who set CONFIG_LATENCYTOP and won't change the user tool latencytop's behavior. Link: http://lkml.kernel.org/r/20190226114602.16902-2-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..bbde5614da71 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -71,9 +71,6 @@ void clear_all_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; -- cgit v1.2.3 From e02c9b0d65a7493180db45320f82482c6ba8ea57 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:34 -0700 Subject: kernel/latencytop.c: rename clear_all_latency_tracing to clear_tsk_latency_tracing The name clear_all_latency_tracing is misleading: in fact it only clears a task's latency_record[], and we already have another function, clear_global_latency_tracing, which clears the global latency_record[] buffer. Link: http://lkml.kernel.org/r/20190226114602.16902-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/latencytop.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b409e792aadc..b4cba953040a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2093,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index bbde5614da71..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,7 +67,7 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; -- cgit v1.2.3 From 6c4e121fda519e0da5c6755a60fdef8cd39634ae Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 14 May 2019 15:42:37 -0700 Subject: kernel/user.c: clean up some leftover code The out_unlock label is misleading; no unlocking happens after it, so just return NULL directly. Also, nothing between the kmem_cache_zalloc() that creates new and the two key_put() can initialize new->uid_keyring or new->session_keyring, so those calls are no-ops.
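Why those calls are dead follows from the optimistic allocate-then-recheck pattern alloc_uid() uses, sketched below in simplified form (refcounting and the hash-bucket computation are trimmed; hashent stands in for the computed bucket; this is an illustration, not the verbatim kernel code):

    struct user_struct *alloc_uid_sketch(kuid_t uid)
    {
    	struct user_struct *up, *new;

    	/* fast path: look up under the lock */
    	spin_lock_irq(&uidhash_lock);
    	up = uid_hash_find(uid, hashent);
    	spin_unlock_irq(&uidhash_lock);
    	if (up)
    		return up;

    	/* slow path: allocate outside the lock... */
    	new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);	/* zeroed */
    	if (!new)
    		return NULL;
    	new->uid = uid;

    	/* ...then recheck, since another task may have raced us */
    	spin_lock_irq(&uidhash_lock);
    	up = uid_hash_find(uid, hashent);
    	if (up)
    		kmem_cache_free(uid_cachep, new);	/* new is still all-zero */
    	else {
    		uid_hash_insert(new, hashent);
    		up = new;
    	}
    	spin_unlock_irq(&uidhash_lock);
    	return up;
    }

Between the zeroing allocation and the duplicate check, no code path touches new->uid_keyring or new->session_keyring, so the removed key_put() calls were always handed NULL.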
Link: http://lkml.kernel.org/r/20190424200404.9114-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Andrew Morton Cc: "Peter Zijlstra (Intel)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..88b834f0eebc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) -- cgit v1.2.3 From b028fb612849add771679c1b99a50d99264c9632 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 14 May 2019 15:44:35 -0700 Subject: kernel/signal.c: annotate implicit fall through There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit removes the following warning: kernel/signal.c:795:13: warning: this statement may fall through [-Wimplicit-fallthrough=] Link: http://lkml.kernel.org/r/20190114203505.17875-1-malat@debian.org Signed-off-by: Mathieu Malaterre Acked-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 62f9aea4a15a..c4dd66436fc5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } -- cgit v1.2.3 From 475dae385497dde3f25271ce77b526a1e54a472a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 May 2019 15:44:52 -0700 Subject: kernel/sysctl.c: switch to bitmap_zalloc() Switch to bitmap_zalloc() to show clearly what we are allocating. Besides that, it returns a pointer of bitmap type instead of an opaque void *.
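The change itself is mechanical; a before/after sketch of the allocation and its matching free (illustrative fragment, with bitmap_len as in the function being patched):

    /* before: sizing arithmetic obscures the intent */
    tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len),
    		     sizeof(unsigned long), GFP_KERNEL);
    /* ... use tmp_bitmap ... */
    kfree(tmp_bitmap);

    /* after: clearly a bitmap of bitmap_len bits, typed unsigned long * */
    tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
    /* ... use tmp_bitmap ... */
    bitmap_free(tmp_bitmap);

bitmap_zalloc()/bitmap_free() are thin wrappers around the same kmalloc-based allocation, so runtime behavior is unchanged; only the intent and the return type improve.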
Link: http://lkml.kernel.org/r/20190304094037.57756-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba158f61aab4..d82f9161adb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3178,9 +3178,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3271,7 +3269,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } -- cgit v1.2.3 From e260ad01f0aa9e96b5386d5cd7184afd949dc457 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 14 May 2019 15:44:55 -0700 Subject: sysctl: return -EINVAL if val violates minmax Currently, when userspace gives us a value that overflows e.g. file-max or other callers of __do_proc_doulongvec_minmax(), we simply ignore the new value and leave the current value untouched. This can be problematic as it gives the illusion that the limit has indeed been bumped when in fact it failed. This commit makes sure to return EINVAL when an overflow is detected. Please note that this is a userspace facing change. Link: http://lkml.kernel.org/r/20190210203943.8227-4-christian@brauner.io Signed-off-by: Christian Brauner Acked-by: Luis Chamberlain Cc: Kees Cook Cc: Alexey Dobriyan Cc: Al Viro Cc: Dominik Brodowski Cc: "Eric W. Biederman" Cc: Joe Lawrence Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d82f9161adb8..f7bd1aead3bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2886,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; -- cgit v1.2.3 From 3116ad38f51c98c81175151bd7358858a92a6031 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 14 May 2019 15:45:13 -0700 Subject: kernel/sysctl.c: fix proc_do_large_bitmap for large input buffers Today, proc_do_large_bitmap() truncates a large write input buffer to PAGE_SIZE - 1, which may result in misparsed numbers at the (truncated) end of the buffer. Further, it fails to notify the caller that the buffer was truncated, so it doesn't get called iteratively to finish the entire input buffer. Tell the caller if there's more work to do by adding the skipped amount back to left/*lenp before returning. To fix the misparsing, reset the position if we have completely consumed a truncated buffer (or if just one char is left, which may be a "-" in a range), and ask the caller to come back for more.
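To see why rewinding matters, consider a parser fed a buffer cut off mid-number; a small standalone sketch of the resume logic (a hypothetical helper, not the kernel code; it assumes the buffer is NUL-terminated at buf+len, as memdup_user_nul() guarantees in the kernel path):

    #include <stdlib.h>

    /*
     * Consume as many complete comma-separated numbers from buf as
     * possible. If the buffer was truncated, a number touching the end
     * may continue in the next chunk, so stop before it and report how
     * many bytes were safely consumed; the caller resubmits the rest.
     */
    static size_t parse_chunk(const char *buf, size_t len, int truncated)
    {
    	const char *p = buf, *end = buf + len;

    	while (p < end) {
    		char *num_end;
    		long val = strtol(p, &num_end, 10);

    		if (num_end == p)
    			break;		/* not a number: stop (error handling elided) */
    		if (truncated && num_end == end)
    			break;		/* possibly cut off: rewind here */
    		(void)val;		/* a real parser would record val */
    		p = num_end;
    		if (p < end && *p == ',')
    			p++;
    	}
    	return p - buf;
    }

Without the rewind, a trailing "1024" truncated after "10" would be recorded as bits 10 and 24 instead of bit 1024, which is exactly the misparsing the patch fixes.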
Link: http://lkml.kernel.org/r/20190320222831.8243-7-mcgrof@kernel.org Signed-off-by: Eric Sandeen Signed-off-by: Luis Chamberlain Acked-by: Kees Cook Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f7bd1aead3bf..943c89178e3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3172,9 +3172,13 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) @@ -3189,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3209,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3227,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; -- cgit v1.2.3 From 1fd402df4586bcc239298081449ce58a78211626 Mon Sep 17 00:00:00 2001 From: Timmy Li Date: Tue, 14 May 2019 15:45:16 -0700 Subject: kernel/pid.c: remove unneeded hash header file Hash functions are not needed since idr is used now. Let's remove hash header file for cleanup. Link: http://lkml.kernel.org/r/20190430053319.95913-1-scuttimmy@gmail.com Signed-off-by: Timmy Li Cc: "Eric W. Biederman" Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleg Nesterov Cc: Mike Rapoport Cc: KJ Tsanaktsidis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..89548d35eefb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 826eba0d77bc74c4d1c611374b76abfe251e8538 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 14 May 2019 15:45:25 -0700 Subject: gcov: clang: move common GCC code into gcc_base.c Patch series "gcov: add Clang support", v4. This patch (of 3): base.c contains a few callbacks specific to GCC's gcov implementation. Move these into their own module in preparation for Clang support. 
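For context, the GCC-specific part of base.c exists because gcc's -fprofile-arcs emits, for every instrumented object file, a constructor that hands that file's profiling record to the kernel. Conceptually it looks like this (a sketch of compiler-generated code; the names are illustrative and the exact mechanics are compiler-internal):

    /* emitted per object file by gcc, run via CONFIG_CONSTRUCTORS */
    static struct gcov_info this_object_info;	/* counters, checksums, ... */

    static void __attribute__((constructor)) gcov_ctor(void)
    {
    	__gcov_init(&this_object_info);
    }

Clang's instrumentation registers its data through a different entry point (llvm_gcov_init() with writeout callbacks, added later in this series), which is why the __gcov_init() plumbing moves into its own gcc_base.c module here.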
Link: http://lkml.kernel.org/r/20190318025411.98014-2-trong@android.com Signed-off-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Tri Vo Tested-by: Trilok Soni Tested-by: Prasad Sodagudi Tested-by: Tri Vo Reviewed-by: Peter Oberparleiter Cc: Daniel Mentz Cc: Petri Gynther Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Makefile | 4 +-- kernel/gcov/base.c | 84 ++---------------------------------------------- kernel/gcov/gcc_base.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcov.h | 3 ++ 4 files changed, 93 insertions(+), 84 deletions(-) create mode 100644 kernel/gcov/gcc_base.c (limited to 'kernel') diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..45431ed679d1 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,5 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..799d42072727 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. 
*/ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..0ecf1d664ec3 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -83,4 +83,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ -- cgit v1.2.3 From e178a5beb36960902379040ee0b667fb0a8eee93 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 14 May 2019 15:45:31 -0700 Subject: gcov: clang support LLVM uses profiling data that's deliberately similar to GCC, but has a very different way of exporting that data. LLVM calls llvm_gcov_init() once per module, and provides a couple of callbacks that we can use to ask for more data. We care about the "writeout" callback, which in turn calls back into compiler-rt/this module to dump all the gathered coverage data to disk: llvm_gcda_start_file() llvm_gcda_emit_function() llvm_gcda_emit_arcs() llvm_gcda_emit_function() llvm_gcda_emit_arcs() [... repeats for each function ...] llvm_gcda_summary_info() llvm_gcda_end_file() This design is much more stateless and unstructured than gcc's, and is intended to run at process exit. This forces us to keep some local state about which module we're dealing with at the moment. 
On the other hand, it also means we don't depend as much on how LLVM represents profiling data internally. See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more details on how this works, particularly GCOVProfiler::emitProfileArcs(), GCOVProfiler::insertCounterWriteout(), and GCOVProfiler::insertFlush(). [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190417225328.208129-1-trong@android.com Signed-off-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Tri Vo Co-developed-by: Nick Desaulniers Co-developed-by: Tri Vo Tested-by: Trilok Soni Tested-by: Prasad Sodagudi Tested-by: Tri Vo Tested-by: Daniel Mentz Tested-by: Petri Gynther Reviewed-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 3 +- kernel/gcov/Makefile | 1 + kernel/gcov/base.c | 2 +- kernel/gcov/clang.c | 581 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcc_3_4.c | 12 ++ kernel/gcov/gcc_4_7.c | 12 ++ kernel/gcov/gcov.h | 2 + 7 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 kernel/gcov/clang.c (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..f71c1adcff31 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +63,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 45431ed679d1..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -4,3 +4,4 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 799d42072727..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -64,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(¤t_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(¤t_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void 
llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. 
+ */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. 
+ */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. 
*/ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. 
+ */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 0ecf1d664ec3..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include #include /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { -- cgit v1.2.3 From c39ea0b9dd24bf1bf2baa5cdbfa1905f3065347b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 14 May 2019 15:45:34 -0700 Subject: panic: avoid the extra noise dmesg When a kernel panic happens, it will first print the panic call stack, then the ending message, like: [ 35.743249] ---[ end Kernel panic - not syncing: Fatal exception [ 35.749975] ------------[ cut here ]------------ The above messages are very useful for debugging. But if the system is configured to not reboot on panic, say the "panic_timeout" parameter equals 0, it will likely print out many noisy messages, like a WARN() call stack for each and every CPU except the panicking one, e.g.: WARNING: CPU: 1 PID: 280 at kernel/sched/core.c:1198 set_task_cpu+0x183/0x190 Call Trace: try_to_wake_up default_wake_function autoremove_wake_function __wake_up_common __wake_up_common_lock __wake_up wake_up_klogd_work_func irq_work_run_list irq_work_tick update_process_times tick_sched_timer __hrtimer_run_queues hrtimer_interrupt smp_apic_timer_interrupt apic_timer_interrupt For people working in console mode, the screen will first show the panic call stack, but it is immediately overwritten by these noisy extra messages, which makes debugging much more difficult, as the original context gets lost on screen. These noisy messages also confuse some users; I have seen many bug reporters post the noisy messages to bugzilla instead of the real panic call stack and context. Add a flag, "suppress_printk", which gets set in panic() to avoid those noisy messages, without changing the current behavior that both panic blinking and the sysrq magic key keep working, as suggested by Petr Mladek. To verify this, make sure the kernel is not configured to reboot on panic, run # echo c > /proc/sysrq-trigger on the console, and check that the console prints only the panic call stack.
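The mechanism is deliberately simple; a standalone model of the flag's effect (illustrative userspace code, not the patch itself):

    #include <stdio.h>

    static int suppress_logging;	/* plays the role of suppress_printk */

    static int log_msg(const char *msg)
    {
    	if (suppress_logging)
    		return 0;	/* late, unimportant messages are dropped */
    	return printf("%s\n", msg);
    }

    int main(void)
    {
    	log_msg("---[ end Kernel panic - not syncing ]---");
    	suppress_logging = 1;	/* set at the end of panic() */
    	log_msg("WARNING: stray wakeup on another CPU");	/* never shown */
    	return 0;
    }

The real check sits at the top of vprintk_emit(), as the diff below shows, so the panic backtrace already on the console can no longer be scrolled away.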
Link: http://lkml.kernel.org/r/1551430186-24169-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Kees Cook Cc: Borislav Petkov Cc: Andi Kleen Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 +++ kernel/printk/printk.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..a6145050a8da 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -321,6 +321,9 @@ void panic(const char *fmt, ...) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..17102fd4c136 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; -- cgit v1.2.3 From b287a25a7148a89d977c819c1f7d6584f875b682 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 14 May 2019 15:45:37 -0700 Subject: panic/reboot: allow specifying reboot_mode for panic only Allow specifying reboot_mode for panic only. This is needed on systems where ramoops is used to store panic logs, and user wants to use warm reset to preserve those, while still having cold reset on normal reboots. Link: http://lkml.kernel.org/r/20190322004735.27702-1-aaro.koskinen@iki.fi Signed-off-by: Aaro Koskinen Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ kernel/reboot.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a6145050a8da..8779d64bace0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -306,6 +306,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. 
*/ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..b9e79e8c7226 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +529,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +563,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': -- cgit v1.2.3 From 89963adcdb430e047f4c03ac3ed6ce9aa42a595c Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 15 May 2019 15:23:52 +1000 Subject: kernel/compat.c: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch aims to suppress 3 missing-break-in-switch false positives on some architectures. Acked-by: Arnd Bergmann Cc: Deepa Dinamani Cc: Gustavo A. R. Silva Cc: Kees Cook Cc: Jann Horn Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/compat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d8a36c6ad7c9..b5f7063c0db6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -346,8 +346,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + /* fall through */ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + /* fall through */ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + /* fall through */ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else -- cgit v1.2.3 From 05b289263772b0698589abc47771264a685cd365 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 16 May 2019 10:38:21 -0700 Subject: signal: unconditionally leave the frozen state in ptrace_stop() Alex Xu reported a regression in strace, caused by the introduction of the cgroup v2 freezer. The regression can be reproduced by stracing the following simple program: #include int main() { write(1, "a", 1); return 0; } An attempt to run strace ./a.out leads to the infinite loop: [ pre-main omitted ] write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? 
ERESTARTSYS (To be restarted if SA_RESTART is set) [ repeats forever ] The problem occurs because the traced task leaves ptrace_stop() (and the signal handling loop) with the frozen bit set. So let's call cgroup_leave_frozen(true) unconditionally after sleeping in ptrace_stop(). With this patch applied, strace works as expected: [ pre-main omitted ] write(1, "a", 1) = 1 exit_group(0) = ? +++ exited with 0 +++ Reported-by: Alex Xu Fixes: 76f969e8948d ("cgroup: cgroup v2 freezer") Signed-off-by: Roman Gushchin Acked-by: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c4dd66436fc5..a1eb44dc9ff5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2113,6 +2113,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_enable_no_resched(); cgroup_enter_frozen(); freezable_schedule(); + cgroup_leave_frozen(true); } else { /* * By the time we got the lock, our tracer went away. -- cgit v1.2.3 From e547ff3f803e779a3898f1f48447b29f43c54085 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Tue, 14 May 2019 19:42:57 -0700 Subject: bpf: relax inode permission check for retrieving bpf program For the iptable module to load a bpf program from a pinned location, it only retrieves a loaded program and cannot change the program content, so requiring write permission for it might not be necessary. Also, when adding or removing an unrelated iptable rule, it might need to flush and reload the xt_bpf related rules as well, which triggers the inode permission check. It might be better to remove the write permission check for the inode so we won't need to grant write access to all the processes that flush and restore iptables rules. Signed-off-by: Chenbo Feng Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bc53e5b20ddc..84a80b02db99 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -518,7 +518,7 @@ out: static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + int ret = inode_permission(inode, MAY_READ); if (ret) return ERR_PTR(ret); -- cgit v1.2.3 From de6da1e8bcf0dd2058b950b127491821207679dc Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 17 May 2019 14:31:50 -0700 Subject: panic: add an option to replay all the printk messages in the buffer Currently on panic, the kernel lowers the loglevel and prints out only the pending printk messages via console_flush_on_panic(). Add an option for users to configure "panic_print" to replay all dmesg in the buffer, some of which they may never have seen due to the loglevel setting; this will help panic debugging.
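For reference, the new mode is bit 5 of the existing panic_print bitmask (PANIC_PRINT_ALL_PRINTK_MSG, 0x00000020, defined in the diff below). Assuming the existing kernel.panic_print sysctl and panic_print= boot parameter simply cover the new bit, it can presumably be enabled at runtime with # echo 0x20 > /proc/sys/kernel/panic_print or at boot with panic_print=0x20 on the kernel command line.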
[feng.tang@intel.com: keep the original console_flush_on_panic() inside panic()] Link: http://lkml.kernel.org/r/1556199137-14163-1-git-send-email-feng.tang@intel.com [feng.tang@intel.com: use logbuf lock to protect the console log index] Link: http://lkml.kernel.org/r/1556269868-22654-1-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1556095872-36838-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Petr Mladek Cc: Aaro Koskinen Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Kees Cook Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- kernel/printk/printk.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8779d64bace0..b4543a31a495 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(void) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -277,7 +281,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 17102fd4c136..a6e06fe38e41 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2535,10 +2535,11 @@ void console_unblank(void) /** * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail @@ -2549,6 +2550,15 @@ void console_flush_on_panic(void) */ console_trylock(); console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { + unsigned long flags; + + logbuf_lock_irqsave(flags); + console_seq = log_first_seq; + console_idx = log_first_idx; + logbuf_unlock_irqrestore(flags); + } console_unlock(); } -- cgit v1.2.3 From 7eaf51a2e094229b75cc0c315f1cbbe2f3960058 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 May 2019 14:51:17 -0400 Subject: stacktrace: Unbreak stack_trace_save_tsk_reliable() Miroslav reported that the livepatch self-tests were failing, specifically a case in which the consistency model ensures that a current executing function is not allowed to be patched, "TEST: busy target module". Recent renovations of stack_trace_save_tsk_reliable() left it returning only an -ERRNO success indication in some configuration combinations: klp_check_stack() ret = stack_trace_save_tsk_reliable() #ifdef CONFIG_ARCH_STACKWALK && CONFIG_HAVE_RELIABLE_STACKTRACE stack_trace_save_tsk_reliable() ret = arch_stack_walk_reliable() return 0 return -EINVAL ... return ret; ... 
if (ret < 0) /* stack_trace_save_tsk_reliable error */ nr_entries = ret; << 0 Previously (and currently for !CONFIG_ARCH_STACKWALK && CONFIG_HAVE_RELIABLE_STACKTRACE) stack_trace_save_tsk_reliable() returned the number of entries that it consumed in the passed storage array. In the case of the above config and trace, be sure to return the stacktrace_cookie.len on stack_trace_save_tsk_reliable() success. Fixes: 25e39e32b0a3f ("livepatch: Simplify stack trace retrieval") Reported-by: Miroslav Benes Signed-off-by: Joe Lawrence Signed-off-by: Thomas Gleixner Reviewed-by: Kamalesh Babulal Acked-by: Josh Poimboeuf Cc: live-patching@vger.kernel.org Cc: jikos@kernel.org Cc: pmladek@suse.com Link: https://lkml.kernel.org/r/20190517185117.24642-1-joe.lawrence@redhat.com --- kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..90d3e0bf0302 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -206,7 +206,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, ret = arch_stack_walk_reliable(consume_entry, &c, tsk); put_task_stack(tsk); - return ret; + return ret ? ret : c.len; } #endif -- cgit v1.2.3 From 457c89965399115e5cd8bf38f9c597293405703d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 13:08:55 +0100 Subject: treewide: Add SPDX license identifier for missed files Add SPDX license identifiers to all files which: - Have no license information of any form - Have EXPORT_.*_SYMBOL_GPL inside which was used in the initial scan/conversion to ignore the file These files fall under the project license, GPL v2 only. The resulting SPDX license identifier is: GPL-2.0-only Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/tnum.c | 1 + kernel/cgroup/cgroup-v1.c | 1 + kernel/cgroup/rstat.c | 1 + kernel/context_tracking.c | 1 + kernel/crash_dump.c | 1 + kernel/dma/swiotlb.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/freezer.c | 1 + kernel/hung_task.c | 1 + kernel/irq_work.c | 1 + kernel/jump_label.c | 1 + kernel/kallsyms.c | 1 + kernel/kthread.c | 1 + kernel/locking/lockdep.c | 1 + kernel/locking/mutex.c | 1 + kernel/locking/percpu-rwsem.c | 1 + kernel/locking/rtmutex.c | 1 + kernel/notifier.c | 1 + kernel/panic.c | 1 + kernel/pid.c | 1 + kernel/pid_namespace.c | 1 + kernel/power/qos.c | 1 + kernel/printk/printk.c | 1 + kernel/profile.c | 1 + kernel/ptrace.c | 1 + kernel/reboot.c | 1 + kernel/resource.c | 1 + kernel/sched/clock.c | 1 + kernel/sched/core.c | 1 + kernel/sched/cputime.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/isolation.c | 1 + kernel/sched/wait.c | 1 + kernel/sched/wait_bit.c | 1 + kernel/signal.c | 1 + kernel/smp.c | 1 + kernel/smpboot.c | 1 + kernel/stacktrace.c | 1 + kernel/sysctl.c | 1 + kernel/umh.c | 1 + kernel/up.c | 1 + kernel/user-return-notifier.c | 1 + kernel/user.c | 1 + kernel/workqueue.c | 1 + 45 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..ca52b9642943 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. 
Each bit can be either diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 68ca5de7ec27..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bb95a35e8c2d..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9ad37b9e44a7..be01a4d627c9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Context tracking: Probe on high level context boundaries such as kernel * and userspace. This includes syscalls and exceptions entry/exit. diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index b64e238b553b..9c23ae074b40 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6f7619c1f877..13f0cb080a4d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * diff --git a/kernel/exit.c b/kernel/exit.c index 8361a560cd1d..1803efb2922f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * diff --git a/kernel/fork.c b/kernel/fork.c index b4cba953040a..b2b87d450b80 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * diff --git a/kernel/freezer.c b/kernel/freezer.c index b162b74611e4..c0738424bb43 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f108a95882c6..14a625c16cb3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Detect Hung Task * diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 73288914ed5e..d42acaf81886 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * diff --git a/kernel/jump_label.c b/kernel/jump_label.c index de6efdecc70d..0bfa10f4410c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * jump label support * diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 14934afa9e68..95a260f9214b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * diff --git a/kernel/kthread.c b/kernel/kthread.c index be4e8795561a..621467c33fef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. 
* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d06190fa5082..c47788fa85f9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/lockdep.c * diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db578783dd36..0c601ae072b3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/locking/mutex.c * diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..b6a9cc62099a 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 978d63a8261c..38fbf9fa7f1b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RT-Mutexes: simple blocking mutual exclusion locks with PI support * diff --git a/kernel/notifier.c b/kernel/notifier.c index bfc95b3e4235..d9f5081d578d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index b4543a31a495..4d9f55bf7d38 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/panic.c * diff --git a/kernel/pid.c b/kernel/pid.c index 89548d35eefb..e5cad0c7d5dd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..f54bc7cb6c2d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d22131afc1e..33e3febaba53 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a6e06fe38e41..1888f6a3b694 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * diff --git a/kernel/profile.c b/kernel/profile.c index 9c08a2c7cb1d..af7c94bf5fa1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. 
Manages a direct-mapped profile hit count buffer, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..5710d07e67cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/ptrace.c * diff --git a/kernel/reboot.c b/kernel/reboot.c index b9e79e8c7226..c4d472b7f1b4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/reboot.c * diff --git a/kernel/resource.c b/kernel/resource.c index 8c15f846e8ef..158f04ec1d4f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/resource.c * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e3e3b979f9bd..1152259a4ca0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sched_clock() for unstable CPU clocks * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 102dfcf0a29a..874c427742a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/core.c * diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba4a143bdcf3..2305ce89a26c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple CPU accounting cgroup controller */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5516bae0c1b..80940939b733 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic entry points for the idle threads and * implementation of the idle task scheduling class. diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 687302051a27..123ea07a3f3b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Housekeeping management. Manage the targets for routine code that can run on * any CPU: unbound workqueues, timers, kthreads and any offloadable work. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6eb1f8efd221..fa0f9adfb752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic waiting primitives. 
* diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c67c6d24adc2..45eba18a2898 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/signal.c b/kernel/signal.c index a1eb44dc9ff5..d7b9d14ac80d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * diff --git a/kernel/smp.c b/kernel/smp.c index f4cf1b0bb3b8..d155374632eb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c230c2dd48e1..2efe1e206167 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Common SMP CPU bringup/teardown functions */ diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..5667f1da3ede 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/stacktrace.c * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 943c89178e3d..7d1008be6173 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface * diff --git a/kernel/umh.c b/kernel/umh.c index d937cbad903a..7f255b5a8845 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ diff --git a/kernel/up.c b/kernel/up.c index ff536f9cc8a2..483c9962c999 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Uniprocessor-only support functions. The counterpart to kernel/smp.c */ diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 9586b670a5b2..870ecd7c63ed 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/user.c b/kernel/user.c index 88b834f0eebc..78b17e36e705 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9657315405de..95aea04ff722 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * -- cgit v1.2.3 From ec8f24b7faaf3d4799a7c3f4c1b87f6b02778ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 13:07:45 +0100 Subject: treewide: Add SPDX license identifier - Makefile/Kconfig Add SPDX license identifiers to all Make/Kconfig files which: - Have no license information of any form These files fall under the project license, GPL v2 only. 
The resulting SPDX license identifier is: GPL-2.0-only Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/Kconfig.freezer | 1 + kernel/Kconfig.hz | 1 + kernel/Kconfig.locks | 1 + kernel/Kconfig.preempt | 1 + kernel/debug/Makefile | 1 + kernel/dma/Kconfig | 1 + kernel/gcov/Kconfig | 1 + kernel/irq/Kconfig | 1 + kernel/livepatch/Kconfig | 1 + kernel/livepatch/Makefile | 1 + kernel/power/Kconfig | 1 + kernel/printk/Makefile | 1 + kernel/rcu/Kconfig | 1 + kernel/rcu/Kconfig.debug | 1 + kernel/time/Kconfig | 1 + kernel/trace/Kconfig | 1 + 16 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer index a3bb4cb52539..68646feefb3d 100644 --- a/kernel/Kconfig.freezer +++ b/kernel/Kconfig.freezer @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only config FREEZER def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a846757..38ef6d06888e 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer Interrupt Frequency Configuration # diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index bf770d7556f7..e0852dc333ac 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # The ARCH_INLINE foo is necessary because select ignores "depends on" # diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0fee5fe6c899..dc0b682ec2d9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only choice prompt "Preemption Model" diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index a85edc339985..332ee6c6ec2c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the linux kernel debugger # diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 83d711f8d665..70f8f8d9200e 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAS_DMA bool diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index f71c1adcff31..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "GCOV-based kernel profiling" config GCOV_KERNEL diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8fee06625c37..f92d9a687372 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "IRQ subsystem" # Options selectable by the architecture code diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index ec4565122e65..54102deb50ba 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAVE_LIVEPATCH bool help diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index b36ceda6488e..cf9b5bcdb952 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9bbaaab14b36..ff8592ddedee 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE diff --git 
a/kernel/printk/Makefile b/kernel/printk/Makefile index 4a2ffc39eb95..4d052fc6bcde 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 37301430970e..480edf328b51 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related configuration options # diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..5ec3ea4028e2 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related debugging configuration options # diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e2c038d6c13c..fcc42353f125 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer subsystem related configuration options # diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d965cef6c77..564e5fdb025f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Architectures that offer an FUNCTION_TRACER implementation should # select HAVE_FUNCTION_TRACER: -- cgit v1.2.3 From d6cd1e9b9ff4f5e2d6b7085ad8601f86177fd300 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:39 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 9 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not you can access it online at http www gnu org licenses gpl 2 0 html extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Steve Winslow Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154041.430943677@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/locking/test-ww_mutex.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 65a3b7e55b9f..3e82f449b4ff 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Module-based API test facility for ww_mutexes - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. */ #include -- cgit v1.2.3 From 1ccea77e2a2687cae171b7987eb44730ec8c6d5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:43 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 13 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not see http www gnu org licenses this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details [based] [from] [clk] [highbank] [c] you should have received a copy of the gnu general public license along with this program if not see http www gnu org licenses extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 355 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Steve Winslow Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154041.837383322@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/livepatch/core.c | 14 +------------- kernel/livepatch/patch.c | 14 +------------- kernel/livepatch/shadow.c | 14 +------------- kernel/livepatch/transition.c | 14 +------------- kernel/printk/internal.h | 14 +------------- kernel/printk/printk_safe.c | 14 +------------- 6 files changed, 6 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 91cd519756d3..2398832947c6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * core.c - Kernel Live Patching Core * * Copyright (C) 2014 Seth Jennings * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 99cb3ad05eb4..bd43537702bd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * patch.c - livepatch patching functions * * Copyright (C) 2014 Seth Jennings * Copyright (C) 2014 SUSE * Copyright (C) 2015 Josh Poimboeuf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 83958c814439..e5c9fb295ba9 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * shadow.c - Shadow Variables * * Copyright (C) 2014 Josh Poimboeuf * Copyright (C) 2014 Seth Jennings * Copyright (C) 2017 Joe Lawrence - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ /** diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index c53370d596be..abb2a4a2cbb2 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * transition.c - Kernel Live Patching transition functions * * Copyright (C) 2015-2016 Josh Poimboeuf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 0f1898820cba..c8e6ab689d42 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * internal.h - printk internal definitions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 0913b4d385de..b4045e782743 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include -- cgit v1.2.3 From 7170066ecd289cd8560695b6f86ba8dc723b6505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:55 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 25 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it would be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 6 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Steve Winslow Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154043.007767574@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/delayacct.c | 11 +---------- kernel/test_kprobes.c | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2a12b988c717..27725754ac99 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* delayacct.c - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 
2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #include diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 7bca480151b0..76c997fdbc9d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * test_kprobes.c - simple sanity test for *probes * * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define pr_fmt(fmt) "Kprobe smoke test: " fmt -- cgit v1.2.3 From 55267c88c003a3648567beae7c90512d3e2ab15e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:50 -0500 Subject: tracing: Prevent hist_field_var_ref() from accessing NULL tracing_map_elts hist_field_var_ref() is an implementation of hist_field_fn_t(), which can be called with a null tracing_map_elt elt param when assembling a key in event_hist_trigger(). In the case of hist_field_var_ref() this doesn't make sense, because a variable can only be resolved by looking it up using an already assembled key i.e. a variable can't be used to assemble a key since the key is required in order to access the variable. Upper layers should prevent the user from constructing a key using a variable in the first place, but in case one slips through, it shouldn't cause a NULL pointer dereference. Also if one does slip through, we want to know about it, so emit a one-time warning in that case. Link: http://lkml.kernel.org/r/64ec8dc15c14d305295b64cdfcc6b2b9dd14753f.1555597045.git.tom.zanussi@linux.intel.com Reported-by: Vincent Bernat Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7fca3457c705..06e7b9f66de6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1854,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field, struct hist_elt_data *elt_data; u64 var_val = 0; + if (WARN_ON_ONCE(!elt)) + return var_val; + elt_data = elt->private_data; var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; -- cgit v1.2.3 From c8d94a1878342fdffedaaf15201d951dfc147065 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:51 -0500 Subject: tracing: Check keys for variable references in expressions too There's an existing check for variable references in keys, but it doesn't go far enough. 
It checks whether a key field is a variable reference but doesn't check whether it's an expression containing variable references, which can cause the same problems for callers. Use the existing field_has_hist_vars() function rather than a direct top-level flag check to catch all possible variable references. Link: http://lkml.kernel.org/r/e8c3d3d53db5ca90ceea5a46e5413103a6902fc7.1555597045.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reported-by: Vincent Bernat Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 06e7b9f66de6..2b76f9520bd0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -59,7 +59,7 @@ C(NO_CLOSING_PAREN, "No closing paren found"), \ C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ - C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ C(VAR_NOT_FOUND, "Couldn't find variable"), \ C(FIELD_NOT_FOUND, "Couldn't find field"), @@ -4506,7 +4506,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + if (field_has_hist_vars(hist_field, 0)) { hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; -- cgit v1.2.3 From 9b2ca371b1505a547217b244f903ad3fb86fa5b4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:52 -0500 Subject: tracing: Add a check_val() check before updating cond_snapshot() track_val Without this check a snapshot is taken whenever a bucket's max is hit, rather than only when the global max is hit, as it should be. Before: In this example, we do a first run of the workload (cyclictest), examine the output, note the max ('triggering value') (347), then do a second run and note the max again. In this case, the max in the second run (39) is below the max in the first run, but since we haven't cleared the histogram, the first max is still in the histogram and is higher than any other max, so it should still be the max for the snapshot. It isn't however - the value should still be 347 after the second run. # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio,prev_comm):onmax($wakeup_lat).snapshot() if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2143 } hitcount: 199 max: 44 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2145 } hitcount: 1325 max: 38 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2144 } hitcount: 1982 max: 347 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 Snapshot taken (see tracing/snapshot). 
Details: triggering value { onmax($wakeup_lat) }: 347 triggered by event with key: { next_pid: 2144 } # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2143 } hitcount: 199 max: 44 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2148 } hitcount: 199 max: 16 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 { next_pid: 2145 } hitcount: 1325 max: 38 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2150 } hitcount: 1326 max: 39 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2144 } hitcount: 1982 max: 347 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2149 } hitcount: 1983 max: 130 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/0 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 39 triggered by event with key: { next_pid: 2150 } After: In this example, we do a first run of the workload (cyclictest), examine the output, note the max ('triggering value') (375), then do a second run and note the max again. In this case, the max in the second run is still 375, the highest in any bucket, as it should be. # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2072 } hitcount: 200 max: 28 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 { next_pid: 2074 } hitcount: 1323 max: 375 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2073 } hitcount: 1980 max: 153 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 375 triggered by event with key: { next_pid: 2074 } # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2101 } hitcount: 199 max: 49 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2072 } hitcount: 200 max: 28 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 { next_pid: 2074 } hitcount: 1323 max: 375 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2103 } hitcount: 1325 max: 74 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2073 } hitcount: 1980 max: 153 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2102 } hitcount: 1981 max: 84 next_prio: 19 next_comm: cyclictest prev_pid: 12 prev_prio: 120 prev_comm: kworker/0:1 Snapshot taken (see tracing/snapshot). 
Details: triggering value { onmax($wakeup_lat) }: 375 triggered by event with key: { next_pid: 2074 } Link: http://lkml.kernel.org/r/95958351329f129c07504b4d1769c47a97b70d65.1555597045.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: a3785b7eca8fd ("tracing: Add hist trigger snapshot() action") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 2b76f9520bd0..ca6b0dff60c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3585,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) struct track_data *track_data = tr->cond_snapshot->cond_data; struct hist_elt_data *elt_data, *track_elt_data; struct snapshot_context *context = cond_data; + struct action_data *action; u64 track_val; if (!track_data) return false; + action = track_data->action_data; + track_val = get_track_val(track_data->hist_data, context->elt, track_data->action_data); + if (!action->track_data.check_val(track_data->track_val, track_val)) + return false; + track_data->track_val = track_val; memcpy(track_data->key, context->key, track_data->key_len); -- cgit v1.2.3 From 4eebe38a37f9397ffecd4bd3afbdf36838a97969 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Thu, 28 Mar 2019 03:49:46 +0530 Subject: kernel/trace/trace.h: Remove duplicate header of trace_seq.h Remove duplicate header which is included twice. Link: http://lkml.kernel.org/r/1553725186-41442-1-git-send-email-jagdsh.linux@gmail.com Reviewed-by: Mukesh Ojha Signed-off-by: Jagadeesh Pagadala Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1974ce818ddb..82c70b63d375 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #ifdef CONFIG_FTRACE_SYSCALLS -- cgit v1.2.3 From 1b038c6e05ff70a1e66e3e571c2e6106bdb75f53 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Fri, 17 May 2019 13:52:31 +0200 Subject: perf/ring_buffer: Fix exposing a temporarily decreased data_head In perf_output_put_handle(), an IRQ/NMI can happen in below location and write records to the same ring buffer: ... local_dec_and_test(&rb->nest) ... <-- an IRQ/NMI can happen here rb->user_page->data_head = head; ... In this case, a value A is written to data_head in the IRQ, then a value B is written to data_head after the IRQ. And A > B. As a result, data_head is temporarily decreased from A to B. And a reader may see data_head < data_tail if it read the buffer frequently enough, which creates unexpected behaviors. This can be fixed by moving dec(&rb->nest) to after updating data_head, which prevents the IRQ/NMI above from updating data_head. [ Split up by peterz. 
] Signed-off-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mark.rutland@arm.com Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.224478157@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 674b35383491..009467a60578 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -51,11 +51,18 @@ again: head = local_read(&rb->head); /* - * IRQ/NMI can happen here, which means we can miss a head update. + * IRQ/NMI can happen here and advance @rb->head, causing our + * load above to be stale. */ - if (!local_dec_and_test(&rb->nest)) + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + if (local_read(&rb->nest) > 1) { + local_dec(&rb->nest); goto out; + } /* * Since the mmap() consumer (userspace) can run on a different CPU: @@ -87,9 +94,18 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. + * We must publish the head before decrementing the nest count, + * otherwise an IRQ/NMI can publish a more recent head value and our + * write will (temporarily) publish a stale value. + */ + barrier(); + local_set(&rb->nest, 0); + + /* + * Ensure we decrement @rb->nest before we validate the @rb->head. + * Otherwise we cannot be sure we caught the 'last' nested update. */ + barrier(); if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); goto again; -- cgit v1.2.3 From 3f9fbe9bd86c534eba2faf5d840fd44c6049f50e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:32 +0200 Subject: perf/ring_buffer: Add ordering to rb->nest increment Similar to how decrementing rb->nest too early can cause data_head to (temporarily) be observed to go backward, so too can this happen when we increment too late. This barrier() ensures the rb->head load happens after the increment, both for the one in the 'goto again' path and for the one from perf_output_get_handle() -- albeit very unlikely to matter for the latter. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.309516009@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 009467a60578..4b5f8d932400 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -48,6 +48,15 @@ static void perf_output_put_handle(struct perf_output_handle *handle) unsigned long head; again: + /* + * In order to avoid publishing a head value that goes backwards, + * we must ensure the load of @rb->head happens after we've + * incremented @rb->nest.
+ * + * Otherwise we can observe a @rb->head value before one published + * by an IRQ/NMI happening between the load and the increment. + */ + barrier(); head = local_read(&rb->head); /* -- cgit v1.2.3 From 4d839dd9e4356bbacf3eb0ab13a549b83b008c21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:33 +0200 Subject: perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data We must use {READ,WRITE}_ONCE() on rb->user_page data such that concurrent usage will see whole values. A few key sites were missing this. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: 7b732a750477 ("perf_counter: new output ABI - part 1") Link: http://lkml.kernel.org/r/20190517115418.394192145@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4b5f8d932400..7a0c73e4b3eb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -100,7 +100,7 @@ again: * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, @@ -496,7 +496,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; @@ -528,7 +528,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; -- cgit v1.2.3 From 5322ea58a06da2e69c5ef36a9b4d4b9255edd423 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:34 +0200 Subject: perf/ring-buffer: Use regular variables for nesting While the IRQ/NMI will nest, the nest-count will be invariant over the actual exception, since the decrements will match the increments. This means we can -- carefully -- use a regular variable since the typical LOAD-STORE race doesn't exist (similar to preempt_count). This optimizes the ring-buffer for all LOAD-STORE architectures, since they need to use atomic ops to implement local_t. 
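[ Illustration, not part of the patch series: the publish protocol that these three ring-buffer patches converge on, sketched as self-contained C. The names rb_model and rb_publish are invented for this sketch, READ_ONCE()/WRITE_ONCE()/barrier() are reduced to minimal gcc/clang stand-ins, and the smp_wmb() pairing with the userspace reader is omitted.

#define barrier()        __asm__ __volatile__("" ::: "memory")
#define READ_ONCE(x)     (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct rb_model {
	unsigned int nest;        /* nested writers; invariant over an IRQ */
	unsigned long head;       /* kernel-side write position */
	unsigned long data_head;  /* stands in for user_page->data_head */
};

static void rb_publish(struct rb_model *rb)
{
	unsigned int nest = READ_ONCE(rb->nest);
	unsigned long head;

	if (nest > 1) {                    /* inner writer: only drop the count */
		WRITE_ONCE(rb->nest, nest - 1);
		return;
	}
again:
	barrier();                         /* load head only after nest is seen */
	head = READ_ONCE(rb->head);
	WRITE_ONCE(rb->data_head, head);   /* publish to the "user page" */
	barrier();                         /* publish before dropping the count */
	WRITE_ONCE(rb->nest, 0);
	barrier();                         /* drop the count before the re-check */
	if (head != READ_ONCE(rb->head)) { /* an IRQ writer slipped in */
		WRITE_ONCE(rb->nest, 1);
		goto again;
	}
}

An IRQ/NMI that lands between the data_head store and the nest store runs this protocol itself, and the final head re-check then catches anything it appended, so the outermost writer never leaves a stale head published. ]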
Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Cc: yabinc@google.com Link: http://lkml.kernel.org/r/20190517115418.481392777@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -24,7 +24,7 @@ struct ring_buffer { atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ - local_t nest; /* nested writers */ + unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ @@ -41,7 +41,7 @@ struct ring_buffer { /* AUX area */ long aux_head; - local_t aux_nest; + unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7a0c73e4b3eb..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) struct ring_buffer *rb = handle->rb; preempt_disable(); - local_inc(&rb->nest); + + /* + * Avoid an explicit LOAD/STORE such that architectures with memops + * can use them. + */ + (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } @@ -46,6 +51,17 @@ static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; + unsigned int nest; + + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + nest = READ_ONCE(rb->nest); + if (nest > 1) { + WRITE_ONCE(rb->nest, nest - 1); + goto out; + } again: /* @@ -64,15 +80,6 @@ again: * load above to be stale. */ - /* - * If this isn't the outermost nesting, we don't have to update - * @rb->user_page->data_head. - */ - if (local_read(&rb->nest) > 1) { - local_dec(&rb->nest); - goto out; - } - /* * Since the mmap() consumer (userspace) can run on a different CPU: * @@ -108,7 +115,7 @@ again: * write will (temporarily) publish a stale value. */ barrier(); - local_set(&rb->nest, 0); + WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. 
@@ -116,7 +123,7 @@ again: */ barrier(); if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); + WRITE_ONCE(rb->nest, 1); goto again; } @@ -355,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; + unsigned int nest; if (output_event->parent) output_event = output_event->parent; @@ -385,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; + nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ - if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + if (WARN_ON_ONCE(nest)) goto err_put; + WRITE_ONCE(rb->aux_nest, nest + 1); + aux_head = rb->aux_head; handle->rb = rb; @@ -419,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!handle->size) { /* A, matches D */ event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } @@ -508,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->event = NULL; - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); -- cgit v1.2.3 From b4d0d230ccfb5d1a9ea85da64aa584df7c148ee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 May 2019 19:08:01 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 36 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public licence as published by the free software foundation either version 2 of the licence or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 114 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190520170857.552531963@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cred.c | 6 +----- kernel/module-internal.h | 6 +----- kernel/module_signing.c | 6 +----- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..e74ffdc98a92 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include #include diff --git a/kernel/module-internal.h b/kernel/module-internal.h index d354341f8cc0..33783abc377b 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Module internals * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b9a926fd86b..b10fb1986ca9 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Module signature checker * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include -- cgit v1.2.3 From 6ff3f917e06625f9612f0dbcda10bef45b099b00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 May 2019 19:08:03 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 38 Based on 1 normalized pattern(s): this file is released under the gplv2 and any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190520170857.732920462@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/stop_machine.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 7231fb5953fc..2b5a6754646f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/stop_machine.c * @@ -5,8 +6,6 @@ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo - * - * This file is released under the GPLv2 and any later version. */ #include #include -- cgit v1.2.3 From f7b101d33046a837c2aa4526cef28a3c785d7af2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 15 May 2019 17:35:51 -0400 Subject: kheaders: Move from proc to sysfs The kheaders archive consisting of the kernel headers used for compiling bpf programs is in /proc. However there is concern that moving it here will make it permanent. Let us move it to /sys/kernel as discussed [1]. 
[1] https://lore.kernel.org/patchwork/patch/1067310/#1265969 Suggested-by: Steven Rostedt Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/Makefile | 4 +-- kernel/gen_ikh_data.sh | 89 -------------------------------------------------- kernel/gen_kheaders.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 40 +++++++++-------------- 4 files changed, 107 insertions(+), 115 deletions(-) delete mode 100755 kernel/gen_ikh_data.sh create mode 100755 kernel/gen_kheaders.sh (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 33824f0385b3..a8d923b5481b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o +obj-$(CONFIG_IKHEADERS) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh deleted file mode 100755 index 591a94f7b387..000000000000 --- a/kernel/gen_ikh_data.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# This script generates an archive consisting of kernel headers -# for CONFIG_IKHEADERS_PROC. -set -e -spath="$(dirname "$(readlink -f "$0")")" -kroot="$spath/.." -outdir="$(pwd)" -tarfile=$1 -cpio_dir=$outdir/$tarfile.tmp - -# Script filename relative to the kernel source root -# We add it to the archive because it is small and any changes -# to this script will also cause a rebuild of the archive. -sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" - -src_file_list=" -include/ -arch/$SRCARCH/include/ -$sfile -" - -obj_file_list=" -include/ -arch/$SRCARCH/include/ -" - -# Support incremental builds by skipping archive generation -# if timestamps of files being archived are not changed. - -# This block is useful for debugging the incremental builds. -# Uncomment it for debugging. -# iter=1 -# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; -# else; iter=$(($(cat /tmp/iter) + 1)); fi -# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter -# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter - -# include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. 
-pushd $kroot > /dev/null -src_files_md5="$(find $src_file_list -type f | - grep -v "include/generated/compile.h" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $obj_file_list -type f | - grep -v "include/generated/compile.h" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" - -if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi -if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then - exit -fi - -if [ "${quiet}" != "silent_" ]; then - echo " GEN $tarfile" -fi - -rm -rf $cpio_dir -mkdir $cpio_dir - -pushd $kroot > /dev/null -for f in $src_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null - -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. -for f in $obj_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 - -# Remove comments except SDPX lines -find $cpio_dir -type f -print0 | - xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' - -tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null - -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 -echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 - -rm -rf $cpio_dir diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh new file mode 100755 index 000000000000..581b83534587 --- /dev/null +++ b/kernel/gen_kheaders.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. 
+pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 70ae6052920d..8f69772af77b 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c @@ -8,9 +8,8 @@ #include #include -#include +#include #include -#include /* * Define kernel_headers_data and kernel_headers_data_end, within which the @@ -31,39 +30,32 @@ extern char kernel_headers_data; extern char kernel_headers_data_end; static ssize_t -ikheaders_read_current(struct file *file, char __user *buf, - size_t len, loff_t *offset) +ikheaders_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) { - return simple_read_from_buffer(buf, len, offset, - &kernel_headers_data, - &kernel_headers_data_end - - &kernel_headers_data); + memcpy(buf, &kernel_headers_data + off, len); + return len; } -static const struct file_operations ikheaders_file_ops = { - .read = ikheaders_read_current, - .llseek = default_llseek, +static struct bin_attribute kheaders_attr __ro_after_init = { + .attr = { + .name = "kheaders.tar.xz", + .mode = 0444, + }, + .read = &ikheaders_read, }; static int __init ikheaders_init(void) { - struct proc_dir_entry *entry; - - /* create the current headers file */ - entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, - &ikheaders_file_ops); - if (!entry) - return -ENOMEM; - - proc_set_size(entry, - &kernel_headers_data_end - - &kernel_headers_data); - return 0; + kheaders_attr.size = (&kernel_headers_data_end - + &kernel_headers_data); + return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } static void __exit ikheaders_cleanup(void) { - remove_proc_entry("kheaders.tar.xz", NULL); + sysfs_remove_bin_file(kernel_kobj, &kheaders_attr); } module_init(ikheaders_init); -- cgit v1.2.3 From 1457dc9ed8da871fbbc0a2ebdaed0405eeeed0cf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 15 May 2019 17:35:52 -0400 Subject: kheaders: Do not regenerate archive if config is not changed Linus reported an issue that doing an allmodconfig was causing the 
kheaders archive to be regenerated even though the config is the same. This patch fixes the issue by ignoring the config-related header files for "knowing when to regenerate based on timestamps". Instead, if the CONFIG_X_Y option really changes, then there is the include/config/X/Y.h file, which already tells us "if a config really changed". So we don't really need these files for regeneration detection anyway, and ignoring them fixes Linus's issue. Reported-by: Linus Torvalds Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/gen_kheaders.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 581b83534587..9a34e1d9bd7f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -31,9 +31,8 @@ arch/$SRCARCH/include/ # This block is useful for debugging the incremental builds. # Uncomment it for debugging. -# iter=1 -# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; -# else; iter=$(($(cat /tmp/iter) + 1)); fi +# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; +# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi # find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter # find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter @@ -43,10 +42,18 @@ arch/$SRCARCH/include/ pushd $kroot > /dev/null src_files_md5="$(find $src_file_list -type f | grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + grep -v "include/config/auto.conf" | + grep -v "include/config/auto.conf.cmd" | + grep -v "include/config/tristate.conf" | xargs ls -lR | md5sum | cut -d ' ' -f1)" popd > /dev/null obj_files_md5="$(find $obj_file_list -type f | grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + grep -v "include/config/auto.conf" | + grep -v "include/config/auto.conf.cmd" | + grep -v "include/config/tristate.conf" | xargs ls -lR | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi @@ -82,7 +89,7 @@ find $cpio_dir -type f -print0 | tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 +echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit v1.2.3 From 51816e9e113934281b44f1a352852ef7631e75ea Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 24 May 2019 15:42:22 -0400 Subject: locking/lock_events: Use this_cpu_add() when necessary The kernel test robot has reported that the use of __this_cpu_add() causes bug messages like: BUG: using __this_cpu_add() in preemptible [00000000] code: ... Given the imprecise nature of the count and the possibility of resetting the count and doing the measurement again, it is not really a big problem to use the unprotected __this_cpu_*() functions. To make the preemption checking code happy, the this_cpu_*() functions will be used if CONFIG_DEBUG_PREEMPT is defined. The imprecise nature of the locking counts is also documented with the suggestion that we should run the measurement a few times with the counts reset in between to get a better picture of what is going on under the hood. 
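[ Illustration, not part of the patch: the same compile-time selection pattern applied to a hypothetical percpu event counter. this_cpu_inc(), __this_cpu_inc() and DECLARE_PER_CPU() are the real kernel interfaces; everything named myev_* is invented for the sketch.

#ifdef CONFIG_DEBUG_PREEMPT
#define myev_inc(x)	this_cpu_inc(x)		/* keeps the preemption checks quiet */
#else
#define myev_inc(x)	__this_cpu_inc(x)	/* raw op, lowest overhead */
#endif

DECLARE_PER_CPU(unsigned long, myev_counts);

static inline void myev_count(void)
{
	/* A rare lost update is acceptable here, exactly as argued
	 * for the lock event counts: only the percpu sum matters. */
	myev_inc(myev_counts);
}
]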
Fixes: a8654596f0371 ("locking/rwsem: Enable lock event counting") Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- kernel/locking/lock_events.h | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index feb1acc54611..46b71af8eef2 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -30,13 +30,51 @@ enum lock_events { */ DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); +/* + * The purpose of the lock event counting subsystem is to provide a low + * overhead way to record the number of specific locking events by using + * percpu counters. It is the percpu sum that matters, not specifically + * how many of them happens in each cpu. + * + * It is possible that the same percpu counter may be modified in both + * the process and interrupt contexts. For architectures that perform + * percpu operation with multiple instructions, it is possible to lose + * count if a process context percpu update is interrupted in the middle + * and the same counter is updated in the interrupt context. Therefore, + * the generated percpu sum may not be precise. The error, if any, should + * be small and insignificant. + * + * For those architectures that do multi-instruction percpu operation, + * preemption in the middle and moving the task to another cpu may cause + * a larger error in the count. Again, this will be few and far between. + * Given the imprecise nature of the count and the possibility of resetting + * the count and doing the measurement again, this is not really a big + * problem. + * + * To get a better picture of what is happening under the hood, it is + * suggested that a few measurements should be taken with the counts + * reset in between to stamp out outliner because of these possible + * error conditions. + * + * To minimize overhead, we use __this_cpu_*() in all cases except when + * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() + * will be used to avoid the appearance of unwanted BUG messages. + */ +#ifdef CONFIG_DEBUG_PREEMPT +#define lockevent_percpu_inc(x) this_cpu_inc(x) +#define lockevent_percpu_add(x, v) this_cpu_add(x, v) +#else +#define lockevent_percpu_inc(x) __this_cpu_inc(x) +#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) +#endif + /* * Increment the PV qspinlock statistical counters */ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - __this_cpu_inc(lockevents[event]); + lockevent_percpu_inc(lockevents[event]); } #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) @@ -44,7 +82,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - __this_cpu_add(lockevents[event], inc); + lockevent_percpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -- cgit v1.2.3 From 0c97bf863efce63d6ab7971dad811601e6171d2f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 23 May 2019 14:45:35 +0200 Subject: tracing: Silence GCC 9 array bounds warning Starting with GCC 9, -Warray-bounds detects cases when memset is called starting on a member of a struct but the size to be cleared ends up writing over further members. 
Such a call happens in the trace code to clear, at once, all members after and including `seq` on struct trace_iterator: In function 'memset', inlined from 'ftrace_dump' at kernel/trace/trace.c:8914:3: ./include/linux/string.h:344:9: warning: '__builtin_memset' offset [8505, 8560] from the object at 'iter' is out of the bounds of referenced subobject 'seq' with type 'struct trace_seq' at offset 4368 [-Warray-bounds] 344 | return __builtin_memset(p, c, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In order to avoid GCC complaining about it, we compute the address ourselves by adding the offsetof distance instead of referring directly to the member. Since there are two places doing this clear (trace.c and trace_kdb.c), take the chance to move the workaround into a single place in the internal header. Link: http://lkml.kernel.org/r/20190523124535.GA12931@gmail.com Signed-off-by: Miguel Ojeda [ Removed unnecessary parenthesis around "iter" ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +----- kernel/trace/trace.h | 18 ++++++++++++++++++ kernel/trace/trace_kdb.c | 6 +----- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c92b3d9ea30..1c80521fd436 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8910,12 +8910,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 82c70b63d375..005f08629b8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1966,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } extern struct trace_iterator *tracepoint_print_iter; +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. + */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ + const size_t offset = offsetof(struct trace_iterator, seq); + + /* + * Keep gcc from complaining about overwriting more than just one + * member in the structure. + */ + memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); + + iter->pos = -1; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 6c1ae6b752d1..cca65044c14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -37,12 +37,8 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (skip_entries) kdb_printf("(skipping %d entries)\n", skip_entries); - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { -- cgit v1.2.3 From bb1869012d7b78d1474808cb4c8bd8b272645876 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 16 May 2019 12:43:19 +0200 Subject: ACPI: PM: Call pm_set_suspend_via_firmware() during hibernation On systems with ACPI platform firmware the last stage of hibernation is analogous to system suspend to S3 (suspend-to-RAM), so it should be handled analogously. In particular, pm_suspend_via_firmware() should return 'true' in that stage to let the callers of it know that control will be passed to the platform firmware going forward, so pm_set_suspend_via_firmware() needs to be called then in analogy with acpi_suspend_begin(). However, the platform hibernation ->begin() callback is invoked during the "freeze" transition (before creating a snapshot image of system memory) as well as during the "hibernate" transition which is the last stage of it and pm_set_suspend_via_firmware() should be invoked by that callback in the latter stage only. In order to implement that redefine the hibernation ->begin() callback to take a pm_message_t argument to indicate which stage of hibernation is taking place and rework acpi_hibernation_begin() and acpi_hibernation_begin_old() to take it into account as needed. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..97522630b1b6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -129,7 +129,7 @@ static int hibernation_test(int level) { return 0; } static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; + hibernation_ops->begin(PMSG_FREEZE) : 0; } /** @@ -542,7 +542,7 @@ int hibernation_platform_enter(void) * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ - error = hibernation_ops->begin(); + error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; -- cgit v1.2.3 From dfb4a6f2191a80c8b790117d0ff592fd712d3296 Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Tue, 28 May 2019 17:43:38 +0200 Subject: tracing: Avoid memory leak in predicate_parse() In case of errors, predicate_parse() goes to the out_free label to free memory and to return an error code. However, predicate_parse() does not free the predicates of the temporary prog_stack array, thence leaking them. 
Link: http://lkml.kernel.org/r/20190528154338.29976-1-tomasbortoli@gmail.com Cc: stable@vger.kernel.org Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: syzbot+6b8e0fb820e570c59e19@syzkaller.appspotmail.com Signed-off-by: Tomas Bortoli [ Added protection around freeing prog_stack[i].pred ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d3e59312ef40..5079d1db3754 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -428,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); + prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -579,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, out_free: kfree(op_stack); kfree(inverts); - kfree(prog_stack); + if (prog_stack) { + for (i = 0; prog_stack[i].pred; i++) + kfree(prog_stack[i].pred); + kfree(prog_stack); + } return ERR_PTR(ret); } -- cgit v1.2.3 From f6e2aa91a46d2bc79fce9b93a988dbe7655c90c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 28 May 2019 18:46:37 -0500 Subject: signal/ptrace: Don't leak uninitialized kernel memory with PTRACE_PEEK_SIGINFO Recently syzbot in conjunction with KMSAN reported that ptrace_peek_siginfo can copy an uninitialized siginfo to userspace. Inspecting ptrace_peek_siginfo confirms this. The problem is that off, when initialized from args.off, can be negative. At that point the "if (off >= 0)" test, meant to detect whether off went negative, fails because off was negative from the start. Prevent the core problem by adding a variable found that is only true if a siginfo is found and copied to a temporary in preparation for being copied to userspace. Prevent args.off from being truncated when being assigned to off by testing that off is <= the maximum possible value of off. Convert off to an unsigned long so that we do not have to truncate args.off, so that we have well-defined overflow behavior (any added check then won't risk fighting undefined compiler behavior), and so that we have a type whose maximum value is easy to test for. Cc: Andrei Vagin Cc: stable@vger.kernel.org Reported-by: syzbot+0d602a1b0d8c95bdf299@syzkaller.appspotmail.com Fixes: 84c751bd4aeb ("ptrace: add ability to retrieve signals without removing from a queue (v4)") Signed-off-by: "Eric W. 
Biederman" --- kernel/ptrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..02c6528ead5c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -704,6 +704,10 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (arg.nr < 0) return -EINVAL; + /* Ensure arg.off fits in an unsigned long */ + if (arg.off > ULONG_MAX) + return 0; + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) pending = &child->signal->shared_pending; else @@ -711,18 +715,20 @@ static int ptrace_peek_siginfo(struct task_struct *child, for (i = 0; i < arg.nr; ) { kernel_siginfo_t info; - s32 off = arg.off + i; + unsigned long off = arg.off + i; + bool found = false; spin_lock_irq(&child->sighand->siglock); list_for_each_entry(q, &pending->list, list) { if (!off--) { + found = true; copy_siginfo(&info, &q->info); break; } } spin_unlock_irq(&child->sighand->siglock); - if (off >= 0) /* beyond the end of the list */ + if (!found) /* beyond the end of the list */ break; #ifdef CONFIG_COMPAT -- cgit v1.2.3 From 2874c5fd284268364ece81a7bd936f3c8168e567 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:01 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 3029 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070032.746973796@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643af82f..7c473f208a10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * @@ -12,11 +13,6 @@ * Alexei Starovoitov * Daniel Borkmann * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Andi Kleen - Fix a few bad bugs and races. 
* Kris Katterjohn - Added many additional checks in bpf_check_classic() */ -- cgit v1.2.3 From 1a59d1b8e05ea6ab45f7e18897de1ef0e6bc3da6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:05 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 156 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not write to the free software foundation inc 59 temple place suite 330 boston ma 02111 1307 usa extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1334 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070033.113240726@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/audit.c | 15 +-------------- kernel/audit.h | 15 +-------------- kernel/audit_watch.c | 15 +-------------- kernel/auditfilter.c | 15 +-------------- kernel/extable.c | 14 +------------- kernel/futex.c | 15 +-------------- kernel/kprobes.c | 15 +-------------- kernel/module.c | 14 +------------- kernel/params.c | 14 +------------- kernel/tracepoint.c | 15 +-------------- 10 files changed, 10 insertions(+), 137 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b96bf69183f4..486c968214d9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c @@ -5,20 +6,6 @@ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Written by Rickard E. (Rik) Faith * * Goals: 1) Integrate fully with Security Modules. diff --git a/kernel/audit.h b/kernel/audit.h index 2071725a999f..6c076d4982da 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -1,22 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit -- definition of audit_context structure and supporting types * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. 
* Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index b50c574223fa..1f31c2f1e6fc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_watch.c -- watching inodes * * Copyright 2003-2009 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 303fb04770ce..9f8e190e3bea 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* auditfilter.c -- filtering of audit events * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/extable.c b/kernel/extable.c index 6a5b61ebc66c..e23cce6e6092 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Rewritten by Rusty Russell, on the backs of many others... Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 
- This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/futex.c b/kernel/futex.c index 2268b97d5439..4b5b468c58b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 @@ -29,20 +30,6 @@ * * "The futexes are also cursed." * "But they come in a choice of three flavours!" - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b1ea30a5540e..445337c107e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) * kernel/kprobes.c * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S Kernel diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..80c7c09584cf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (C) 2002 Richard Henderson Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..cf448785d058 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Helpers for initial module or kernel cmdline parsing Copyright (C) 2001 Rusty Russell. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..df3ade14ccbd 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include #include -- cgit v1.2.3 From c942fddf8793b2013be8c901b47d0a8dc02bf99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:06 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 157 Based on 3 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version [author] [kishon] [vijay] [abraham] [i] [kishon]@[ti] [com] this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version [author] [graeme] [gregory] [gg]@[slimlogic] [co] [uk] [author] [kishon] [vijay] [abraham] [i] [kishon]@[ti] [com] [based] [on] [twl6030]_[usb] [c] [author] [hema] [hk] [hemahk]@[ti] [com] this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1105 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070033.202006027@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/audit_fsnotify.c | 11 +---------- kernel/locking/qrwlock.c | 11 +---------- kernel/locking/qspinlock.c | 11 +---------- kernel/locking/qspinlock_stat.h | 10 +--------- kernel/sched/membarrier.c | 11 +---------- kernel/taskstats.c | 12 +----------- kernel/tsacct.c | 13 +------------ 7 files changed, 7 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index b5737b826951..f0d243318452 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_fsnotify.c -- tracking inodes * * Copyright 2003-2009,2014-2015 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. */ #include diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index c7471c3fb798..fe9ca92faa2a 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued read/write locks * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. * * Authors: Waiman Long diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index e14b32c69639..2473f10c6956 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued spinlock * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 54152670ff24..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -1,13 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * Authors: Waiman Long */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3cd8a3a795d2..aa8d75804108 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2010-2017 Mathieu Desnoyers * * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
*/ #include "sched.h" diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5f852b8f59f7..13a0f2e6ebc2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -1,19 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * taskstats.c - Export per-task statistics to userland * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..7be3e7530841 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include -- cgit v1.2.3 From 468e15fdc2ec7048ab1ae93e200559151c84647e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:17 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 170 Based on 1 normalized pattern(s): this file is release under the gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Richard Fontana Reviewed-by: Kate Stewart Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070034.216732358@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 46ba853656f6..35859da8bd4f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers - * - * This file is release under the GPLv2 - * */ #include -- cgit v1.2.3 From 25763b3c864cf517d686661012d184ee47a49b4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2019 10:10:09 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 206 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 107 file(s). 
Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Reviewed-by: Steve Winslow Reviewed-by: Alexios Zavras Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190528171438.615055994@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/bpf_lru_list.c | 5 +---- kernel/bpf/bpf_lru_list.h | 5 +---- kernel/bpf/map_in_map.c | 5 +---- kernel/bpf/map_in_map.h | 5 +---- kernel/bpf/percpu_freelist.c | 5 +---- kernel/bpf/percpu_freelist.h | 5 +---- kernel/bpf/stackmap.c | 5 +---- 7 files changed, 7 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e6ef4401a138..1b6b9349cb85 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 7d4f89b7cb84..f02504640e18 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __MAP_IN_MAP_H__ #define __MAP_IN_MAP_H__ diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include "percpu_freelist.h" diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
*/ #ifndef __PERCPU_FREELIST_H__ #define __PERCPU_FREELIST_H__ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..d38e49f943a1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include -- cgit v1.2.3 From 6b115bf58e6f013ca75e7115aabcbd56c20ff31d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:57 -0700 Subject: cgroup: Call cgroup_release() before __exit_signal() cgroup_release() calls cgroup_subsys->release() which is used by the pids controller to uncharge its pid. We want to use it to manage iteration of dying tasks which requires putting it before __unhash_process(). Move cgroup_release() above __exit_signal(). While this makes it uncharge before the pid is freed, pid is RCU freed anyway and the window is very narrow. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1803efb2922f..a75b6a7f458a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -195,6 +195,7 @@ repeat: rcu_read_unlock(); proc_flush_task(p); + cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -220,7 +221,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); -- cgit v1.2.3 From b636fd38dc40113f853337a7d2a6885ad23b8811 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Implement css_task_iter_skip() When a task is moved out of a cset, task iterators pointing to the task are advanced using the normal css_task_iter_advance() call. This is fine but we'll be tracking dying tasks on csets and thus moving tasks from cset->tasks to (to be added) cset->dying_tasks. When we remove a task from cset->tasks, if we advance the iterators, they may move over to the next cset before we had the chance to add the task back on the dying list, which can allow the task to escape iteration. This patch separates out skipping from advancing. Skipping only moves the affected iterators to the next pointer rather than fully advancing it and the following advancing will recognize that the cursor has already been moved forward and do the rest of advancing. This ensures that when a task moves from one list to another in its cset, as long as it moves in the right direction, it's always visible to iteration. This doesn't cause any visible behavior changes. 
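For illustration, the skip-versus-advance split described above can be pictured with a minimal, self-contained C sketch over a plain singly-linked list; the names here (struct iter, iter_skip, iter_next) are invented for the example and are not the kernel's css_task_iter implementation:

#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

struct iter {
	struct node *pos;	/* cursor into the list */
	bool skipped;		/* a skip already moved the cursor forward */
};

/* Skip: only nudge the cursor off @n and remember that we did so. */
static void iter_skip(struct iter *it, struct node *n)
{
	if (it->pos == n) {
		it->pos = n->next;
		it->skipped = true;
	}
}

/* Advance: if a preceding skip already moved the cursor, consuming the
 * flag is all that remains; otherwise move forward normally. */
static struct node *iter_next(struct iter *it)
{
	if (it->skipped)
		it->skipped = false;
	else if (it->pos)
		it->pos = it->pos->next;
	return it->pos;
}

Because iter_skip() only moves the cursor one step and records that it did, a task that is promptly re-added to another list of the same cset stays visible to the iterator, which is the property the patch relies on.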
Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/cgroup/cgroup.c | 60 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..035aee466bbf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -843,6 +844,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -868,22 +884,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -4430,10 +4433,19 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } -static void css_task_iter_advance(struct css_task_iter *it) +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) { - struct list_head *next; + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} +static void css_task_iter_advance(struct css_task_iter *it) +{ lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4442,15 +4454,15 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. 
*/ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); -- cgit v1.2.3 From c03cd7738a83b13739f00546166969342c8ff014 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Include dying leaders with live threads in PROCS iterations CSS_TASK_ITER_PROCS currently iterates live group leaders; however, this means that a process with a dying leader and live threads will be skipped. IOW, cgroup.procs might be empty while cgroup.threads isn't, which is confusing to say the least. Fix it by making cset track dying tasks and include dying leaders with live threads in PROCS iteration. Signed-off-by: Tejun Heo Reported-and-tested-by: Topi Miettinen Cc: Oleg Nesterov --- kernel/cgroup/cgroup.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 035aee466bbf..a7df319c2e9a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -739,6 +739,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -1213,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -4399,15 +4401,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4446,6 +4451,8 @@ static void css_task_iter_skip(struct css_task_iter *it, static void css_task_iter_advance(struct css_task_iter *it) { + struct task_struct *task; + lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4462,17 +4469,32 @@ repeat: if (it->task_pos == it->tasks_head) it->task_pos = it->mg_tasks_head->next; if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && -
!thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -6009,6 +6031,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; WARN_ON_ONCE(cgroup_task_frozen(tsk)); @@ -6034,6 +6057,13 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); + + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) -- cgit v1.2.3 From 8856ae4df3e9b5295ea2da7ad3b00796386454ec Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 31 May 2019 22:30:12 -0700 Subject: kernel/fork.c: make max_threads symbol static Fix build warning: kernel/fork.c:125:5: warning: symbol 'max_threads' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20190516015118.140561-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reported-by: Hulk Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b2b87d450b80..75675b9bf6df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -123,7 +123,7 @@ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ -int max_threads; /* tunable limit on nr_threads */ +static int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -- cgit v1.2.3 From 11bbd8b416f8abf40900dc5041152892f873d915 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 31 May 2019 22:30:16 -0700 Subject: prctl_set_mm: refactor checks from validate_prctl_map Despite the comment of validate_prctl_map claiming there are no capability checks, it is not completely true since commit 4d28df6152aa ("prctl: Allow local CAP_SYS_ADMIN changing exe_file"). Extract the check out of the function and make the function perform purely arithmetic checks. This patch should not change any behavior, it is mere refactoring for the following patch.
[akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190502125203.24014-2-mkoutny@suse.com Signed-off-by: Michal Koutný Reviewed-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Cc: Kirill Tkhai Cc: Laurent Dufour Cc: Mateusz Guzik Cc: Michal Hocko Cc: Yang Shi Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index bdbfe8d37418..775bf8d18d03 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1882,13 +1882,14 @@ exit_err: } /* + * Check arithmetic relations of passed addresses. + * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. */ -static int validate_prctl_map(struct prctl_mm_map *prctl_map) +static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; - struct mm_struct *mm = current->mm; int error = -EINVAL, i; static const unsigned char offsets[] = { @@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) prctl_map->start_data)) goto out; - /* - * Someone is trying to cheat the auxv vector. - */ - if (prctl_map->auxv_size) { - if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) - goto out; - } - - /* - * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. - */ - if (prctl_map->exe_fd != (u32)-1) { - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) - goto out; - } - error = 0; out: return error; @@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { + /* + * Someone is trying to cheat the auxv vector. + */ + if (!prctl_map.auxv || + prctl_map.auxv_size > sizeof(mm->saved_auxv)) + return -EINVAL; + memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, @@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } if (prctl_map.exe_fd != (u32)-1) { + /* + * Make sure the caller has the rights to + * change /proc/pid/exe link: only local sys admin should + * be allowed to. 
+ */ + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EINVAL; + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; @@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; - struct prctl_mm_map prctl_map; + struct prctl_mm_map prctl_map = { + .auxv = NULL, + .auxv_size = 0, + .exe_fd = -1, + }; struct vm_area_struct *vma; int error; @@ -2139,9 +2141,6 @@ static int prctl_set_mm(int opt, unsigned long addr, prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; - prctl_map.auxv = NULL; - prctl_map.auxv_size = 0; - prctl_map.exe_fd = -1; switch (opt) { case PR_SET_MM_START_CODE: @@ -2181,7 +2180,7 @@ static int prctl_set_mm(int opt, unsigned long addr, goto out; } - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) goto out; -- cgit v1.2.3 From bc81426f5beef7da863d3365bc9d45e820448745 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 31 May 2019 22:30:19 -0700 Subject: prctl_set_mm: downgrade mmap_sem to read lock The commit a3b609ef9f8b ("proc read mm's {arg,env}_{start,end} with mmap semaphore taken.") added synchronization of reading argument/environment boundaries under mmap_sem. Later commit 88aa7cc688d4 ("mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct") avoided the coarse use of mmap_sem in similar situations. But there still remained two places that (mis)use mmap_sem. get_cmdline should also use arg_lock instead of mmap_sem when it reads the boundaries. The second place that should use arg_lock is in prctl_set_mm. By protecting the boundaries fields with the arg_lock, we can downgrade mmap_sem to reader lock (analogous to what we already do in prctl_set_mm_map). [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190502125203.24014-3-mkoutny@suse.com Fixes: 88aa7cc688d4 ("mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct") Signed-off-by: Michal Koutný Signed-off-by: Laurent Dufour Co-developed-by: Laurent Dufour Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Yang Shi Cc: Mateusz Guzik Cc: Kirill Tkhai Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 775bf8d18d03..2969304c29fe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2127,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurrent updates of arg boundaries, we need + * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * validation.
+ */ + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); + spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; @@ -2217,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_write(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); + up_read(&mm->mmap_sem); return error; } -- cgit v1.2.3 From 9852ae3fe5293264f01c49f2571ef7688f7823ce Mon Sep 17 00:00:00 2001 From: Chris Down Date: Fri, 31 May 2019 22:30:22 -0700 Subject: mm, memcg: consider subtrees in memory.events memory.stat and other files already consider subtrees in their output, and we should too in order to not present an inconsistent interface. The current situation is fairly confusing, because people interacting with cgroups expect hierarchical behaviour in the vein of memory.stat, cgroup.events, and other files. For example, this causes confusion when debugging reclaim events under low, as currently these always read "0" at non-leaf memcg nodes, which frequently causes people to misdiagnose breach behaviour. The same confusion applies to other counters in this file when debugging issues. Aggregation is done at write time instead of at read time since these counters aren't hot (unlike memory.stat which is per-page, so it does it at read time), and it makes sense to bundle this with the file notifications. After this patch, events are propagated up the hierarchy: [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 0 oom 0 oom_kill 0 [root@ktst ~]# systemd-run -p MemoryMax=1 true Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 7 oom 1 oom_kill 1 As this is a change in behaviour, this can be reverted to the old behaviour by mounting with the `memory_localevents' flag set. However, we use the new behaviour by default as there's a lack of evidence that there are any current users of memory.events that would find this change undesirable. akpm: this is a behaviour change, so Cc:stable. This is so that forthcoming distros which use cgroup v2 are more likely to pick up the revised behaviour.
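The write-time aggregation described above boils down to walking up the hierarchy whenever an event is charged. A short C sketch of that pattern follows; the struct, field, and function names are illustrative stand-ins (the real accounting lives in mm/memcontrol.c, which is not part of the diff below), and the per-group local_events flag approximates what is really a global mount option:

#include <stdbool.h>

struct group {
	struct group *parent;
	unsigned long events[5];	/* low, high, max, oom, oom_kill */
	bool local_events;		/* mounted with memory_localevents */
};

/* Charge an event to @g and, unless local-only accounting was requested,
 * to every ancestor as well, so non-leaf groups no longer read "0". */
static void group_memory_event(struct group *g, int idx)
{
	do {
		g->events[idx]++;
		/* a memory.events file notification would be raised here */
	} while (!g->local_events && (g = g->parent));
}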
Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Dennis Zhou Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1810,11 +1810,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_memory_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_param_specs[] = { - fsparam_flag ("nsdelegate", Opt_nsdelegate), + fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("memory_localevents", Opt_memory_localevents), {} }; @@ -1837,6 +1839,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_memory_localevents: + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1848,6 +1853,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; } } @@ -1855,6 +1865,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + seq_puts(seq, ",memory_localevents"); return 0; } @@ -6325,7 +6337,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); -- cgit v1.2.3 From 98af37d624ed8c83f1953b1b6b2f6866011fc064 Mon Sep 17 00:00:00 2001 From: Zhenliang Wei Date: Fri, 31 May 2019 22:30:52 -0700 Subject: kernel/signal.c: trace_signal_deliver when signal_group_exit In the fixes commit, removing SIGKILL from each thread signal mask and executing "goto fatal" directly will skip the call to "trace_signal_deliver". At this point, the delivery tracking of the SIGKILL signal will be inaccurate. Therefore, we need to add trace_signal_deliver before "goto fatal" after executing sigdelset. Note: SEND_SIG_NOINFO matches the fact that SIGKILL doesn't have any info. Link: http://lkml.kernel.org/r/20190425025812.91424-1-weizhenliang@huawei.com Fixes: cf43a757fd4944 ("signal: Restore the stop PTRACE_EVENT_EXIT") Signed-off-by: Zhenliang Wei Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Ivan Delalande Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Deepa Dinamani Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d7b9d14ac80d..328a01e1a2f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2485,6 +2485,8 @@ relock: if (signal_group_exit(signal)) { ksig->info.si_signo = signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); recalc_sigpending(); goto fatal; } -- cgit v1.2.3 From a61373476127edac8bcc5ee9d68a74dc1b864f53 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 May 2019 12:45:18 +0200 Subject: PM: sleep: Add kerneldoc comments to some functions Add kerneldoc comments to pm_suspend_via_firmware(), pm_resume_via_firmware() and pm_suspend_via_s2idle() to explain what they do. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..43d869db6c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -62,6 +62,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +/** + * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. + */ bool pm_suspend_via_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; -- cgit v1.2.3 From ec527c318036a65a083ef68d8ba95789d2212246 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 30 May 2019 00:09:39 +0200 Subject: x86/power: Fix 'nosmt' vs hibernation triple fault during resume As explained in 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") we always, no matter what, have to bring up x86 HT siblings during boot at least once in order to avoid the first MCE bringing the system to its knees. That means that whenever 'nosmt' is supplied on the kernel command-line, all the HT siblings are as a result sitting in mwait or cpuidle after going through the online-offline cycle at least once. This causes a serious issue though when a kernel, which saw 'nosmt' on its command-line, is going to perform resume from hibernation: if the resume from the hibernated image is successful, cr3 is flipped in order to point to the address space of the kernel that is being resumed, which in turn means that all the HT siblings are all of a sudden mwaiting on an address which is no longer valid. That results in a triple fault shortly after cr3 is switched, and the machine reboots. Fix this by always waking up all the SMT siblings before initiating the 'restore from hibernation' process; this guarantees that all the HT siblings will be properly carried over to the resumed kernel waiting in resume_play_dead(), and acted upon accordingly afterwards, based on the target kernel configuration. Symmetrically, the resumed kernel has to push the SMT siblings to mwait again in case it has SMT disabled; this means it has to online all the siblings when resuming (so that they come out of hlt) and offline them again to let them reach mwait.
Cc: 4.19+ # v4.19+ Debugged-by: Thomas Gleixner Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") Signed-off-by: Jiri Kosina Acked-by: Pavel Machek Reviewed-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 4 ++-- kernel/power/hibernate.c | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f2ef10460698..077fde6fb953 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2061,7 +2061,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2093,7 +2093,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static int cpuhp_smt_enable(void) +int cpuhp_smt_enable(void) { int cpu, ret = 0; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..b65635753e8e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -257,6 +257,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, (kps % 1000) / 10); } +__weak int arch_resume_nosmt(void) +{ + return 0; +} + /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -324,6 +329,10 @@ static int create_image(int platform_mode) Enable_cpus: suspend_enable_secondary_cpus(); + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); + Platform_finish: platform_finish(platform_mode); -- cgit v1.2.3 From c732327f04a3818f35fa97d07b1d64d31b691d78 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Jun 2019 15:18:43 +0200 Subject: signal: improve comments Improve the comments for pidfd_send_signal(). First, the comment still referred to a file descriptor for a process as a "task file descriptor", which stems from way back at the beginning of the discussion. Replace this with "pidfd" for consistency. Second, the wording for the explanation of the arguments to the syscall was a bit inconsistent, e.g. some used the past tense, some used the present tense. Make the wording more consistent. Signed-off-by: Christian Brauner --- kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 328a01e1a2f0..d622eac9d169 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3621,12 +3621,11 @@ static struct pid *pidfd_to_pid(const struct file *file) } /** - * sys_pidfd_send_signal - send a signal to a process through a task file - * descriptor - * @pidfd: the file descriptor of the process - * @sig: signal to be sent - * @info: the signal info - * @flags: future flags to be passed + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd: file descriptor of the process + * @sig: signal to send + * @info: signal info + * @flags: future flags * * The syscall currently only signals via PIDTYPE_PID which covers * kill(<positive-pid>, <signal>).
It does not signal threads or process -- cgit v1.2.3 From 9c92ab61914157664a2fbdf926df0eb937838e45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:17:56 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 282 Based on 1 normalized pattern(s): this software is licensed under the terms of the gnu general public license version 2 as published by the free software foundation and may be copied distributed and modified under those terms this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 285 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141900.642774971@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cpu_pm.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 67b02e138a47..cbca6879ab7d 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Google, Inc. * * Author: * Colin Cross - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include -- cgit v1.2.3 From 5b497af42fab12cadc0e29bcb7052cf9963603f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:18:09 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 295 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 64 file(s). 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141901.894819585@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/arraymap.c | 10 +--------- kernel/bpf/devmap.c | 10 +--------- kernel/bpf/disasm.c | 10 +--------- kernel/bpf/disasm.h | 10 +--------- kernel/bpf/hashtab.c | 10 +--------- kernel/bpf/helpers.c | 10 +--------- kernel/bpf/syscall.c | 10 +--------- kernel/bpf/verifier.c | 10 +--------- 8 files changed, 8 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 584636c9e2eb..262a321f58a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..15dbc15c5b0c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ /* Devmaps primary use is as a backend map for XDP BPF helper call diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d9ce383c0f9c..b44d8c447afd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e1324a834a24..e546b18d27da 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __BPF_DISASM_H__ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0f2708fde5f7..583df5cb302d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4266ffde07ca..5e28718928ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..ef63d26622f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d15cc4fafa89 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include #include -- cgit v1.2.3 From 4505153954fdb1465d2b178288a9bf646f2a2166 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 16:57:47 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 333 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not write to the free software foundation inc 59 temple place suite 330 boston ma 02111 1307 usa extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 136 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190530000436.384967451@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/dma/debug.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index badd77670d00..099002d84f46 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 Advanced Micro Devices, Inc. * * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "DMA-API: " fmt -- cgit v1.2.3 From ddc64d0ac97814fcc42ed90a2ea0c69658806c67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 May 2019 01:09:24 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 363 Based on 1 normalized pattern(s): released under terms in gpl version 2 see copying extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531081035.689962394@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index cf727d77c6c6..8ebd0fa826f8 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. 
*/ /* The 'cpumap' is primarily used as a backend map for XDP BPF helper -- cgit v1.2.3 From 55716d26439f5c4008b0bcb7f17d1f7c0d8fbcfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:42 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 428 Based on 1 normalized pattern(s): this file is released under the gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 68 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190114.292346262@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 3 +-- kernel/power/main.c | 4 +--- kernel/power/snapshot.c | 4 +--- kernel/power/suspend.c | 3 +-- kernel/power/suspend_test.c | 3 +-- kernel/power/swap.c | 4 +--- kernel/power/user.c | 4 +--- 7 files changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 97522630b1b6..8fc054e5c501 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * @@ -6,8 +7,6 @@ * Copyright (c) 2004 Pavel Machek * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/main.c b/kernel/power/main.c index 4f43e724f6eb..bdbd605c4215 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * */ #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bc9558ab1e5b..83105874f255 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/snapshot.c * @@ -5,9 +6,6 @@ * * Copyright (C) 1998-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..3c57e206f3e8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend.c - Suspend to RAM and standby functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 6a897e8b2a88..60564b58de07 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. * * Copyright (c) 2009 Pavel Machek - * - * This file is released under the GPLv2. 
*/ #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7f6c1a288d3..e1912ad13bdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * @@ -7,9 +8,6 @@ * Copyright (C) 1998,2001-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki * Copyright (C) 2010-2012 Bojan Smojver - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/user.c b/kernel/power/user.c index cb24e840a3e6..77438954cc2b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * */ #include -- cgit v1.2.3 From 767a67b0b35520348dc3b28dcba06454b0f9023d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:44 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 430 Based on 1 normalized pattern(s): distribute under gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 8 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190114.475576622@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/softirq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c3382378d94..a6b81c6b6bff 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/softirq.c * * Copyright (C) 1992 Linus Torvalds * - * Distribute under GPLv2. - * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ -- cgit v1.2.3 From 3e45610181bcc2c68d343a62c1d028890160ef79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:50 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 436 Based on 1 normalized pattern(s): distributed under the terms of the gnu gpl version 2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 2 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.032570679@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/locking/semaphore.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 561acdd39960..d9dd94defc0a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Intel Corporation * Author: Matthew Wilcox * - * Distributed under the terms of the GNU GPL, version 2 - * * This file implements counting semaphores. * A counting semaphore may be acquired 'n' times before sleeping. 
* See mutex.c for single-acquisition sleeping locks which enforce -- cgit v1.2.3 From b886d83c5b621abc84ff9616f14c529be3f6b147 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:55 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 441 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation version 2 of the license extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 315 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Armijn Hemel Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.503150771@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/async.c | 6 +----- kernel/backtracetest.c | 6 +----- kernel/latencytop.c | 6 +----- kernel/nsproxy.c | 6 +----- kernel/sched/cpudeadline.c | 6 +----- kernel/sched/cpupri.c | 6 +----- kernel/ucount.c | 7 +------ kernel/user_namespace.c | 7 +------ kernel/utsname.c | 6 +----- kernel/utsname_sysctl.c | 6 +----- 10 files changed, 10 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 12c332e4e13e..4f9c1d614016 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * async.c: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a563c8fdad0d..a2a97fa3071b 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple stack backtrace regression test module * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 871734ea2f04..e3acead004e6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * latencytop.c: Latency display infrastructure * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f6c5d330059a..c815f58e6bc0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2006 IBM Corporation * * Author: Serge Hallyn * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * * Jun 2006 - namespaces support * OpenVZ, SWsoft Inc. 
* Pavel Emelianov diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..ec4e4a9aab5f 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpudl.c * * Global CPU deadline management * * Author: Juri Lelli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..9c6480e6d62d 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpupri.c * @@ -20,11 +21,6 @@ * searches). For tasks with affinity restrictions, the algorithm has a * worst case complexity of O(min(102, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" diff --git a/kernel/ucount.c b/kernel/ucount.c index f48d1b6376a4..feb128c7b5d9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..0eff45ce7703 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index dcd6be1996fe..f0e491193009 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004 IBM Corporation * * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 258033d62cb3..3732c888a949 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include -- cgit v1.2.3 From cee0c33c546a93957a52ae9ab6bebadbee765ec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 5 Jun 2019 09:54:34 -0700 Subject: cgroup: css_task_iter_skip()'d iterators must be advanced before accessed b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") introduced css_task_iter_skip() which is used to fix task iterations skipping dying threadgroup leaders with live threads. 
Skipping is implemented as a subportion of full advancing but css_task_iter_next() forgot to fully advance a skipped iterator before determining the next task to visit causing it to return invalid task pointers. Fix it by making css_task_iter_next() fully advance the iterator if it has been skipped since the previous iteration. Signed-off-by: Tejun Heo Reported-by: syzbot Link: http://lkml.kernel.org/r/00000000000097025d058a7fd785@google.com Fixes: b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a7df319c2e9a..9538a12d42d6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4550,6 +4550,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); + /* @it may be half-advanced by skips, finish advancing */ + if (it->flags & CSS_TASK_ITER_SKIPPED) + css_task_iter_advance(it); + if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); -- cgit v1.2.3 From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Jun 2019 01:48:57 +0200 Subject: bpf: fix unconnected udp hooks Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently to applications as also stated in original motivation in 7828f20e3779 ("Merge branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter two hooks into Cilium to enable host based load-balancing with Kubernetes, I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes typically sets up DNS as a service and is thus subject to load-balancing. Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API is currently insufficient and thus not usable as-is for standard applications shipped with most distros. To break down the issue we ran into with a simple example: # cat /etc/resolv.conf nameserver 147.75.207.207 nameserver 147.75.207.208 For the purpose of a simple test, we set up above IPs as service IPs and transparently redirect traffic to a different DNS backend server for that node: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 The attached BPF program is basically selecting one of the backends if the service IP/port matches on the cgroup hook. DNS breaks here, because the hooks are not transparent enough to applications which have built-in msg_name address checks: # nslookup 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ;; connection timed out; no servers could be reached # dig 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; connection timed out; no servers could be reached For comparison, if none of the service IPs is used, and we tell nslookup to use 8.8.8.8 directly it works just fine, of course: # nslookup 1.1.1.1 8.8.8.8 1.1.1.1.in-addr.arpa name = one.one.one.one. In order to fix this and thus act more transparent to the application, this needs reverse translation on recvmsg() side. 
A minimal fix for this API is to add similar recvmsg() hooks behind the BPF cgroups static key such that the program can track state and replace the current sockaddr_in{,6} with the original service IP. From BPF side, this basically tracks the service tuple plus socket cookie in an LRU map where the reverse NAT can then be retrieved via map value as one example. Side-note: the BPF cgroups static key should be converted to a per-hook static key in future. Same example after this fix: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 Lookups work fine now: # nslookup 1.1.1.1 1.1.1.1.in-addr.arpa name = one.one.one.one. Authoritative answers can be found from: # dig 1.1.1.1 ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550 ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 ;; OPT PSEUDOSECTION: ; EDNS: version: 0, flags:; udp: 512 ;; QUESTION SECTION: ;1.1.1.1. IN A ;; AUTHORITY SECTION: . 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400 ;; Query time: 17 msec ;; SERVER: 147.75.207.207#53(147.75.207.207) ;; WHEN: Tue May 21 12:59:38 UTC 2019 ;; MSG SIZE rcvd: 111 And from an actual packet level it shows that we're using the back end server when talking via 147.75.207.20{7,8} front end: # tcpdump -i any udp [...] 12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) [...] In order to be flexible and to have same semantics as in sendmsg BPF programs, we only allow return codes in [1,1] range. In the sendmsg case the program is called if msg->msg_name is present which can be the case in both, connected and unconnected UDP. The former only relies on the sockaddr_in{,6} passed via connect(2) if passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar way to call into the BPF program whenever a non-NULL msg->msg_name was passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note that for TCP case, the msg->msg_name is ignored in the regular recvmsg path and therefore not relevant. For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE, the hook is not called. This is intentional as it aligns with the same semantics as in case of TCP cgroup BPF hooks right now. This might be better addressed in future through a different bpf_attach_type such that this case can be distinguished from the regular recvmsg paths, for example. 
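As an illustrative sketch only (not part of this patch: the map layout and the names svc_val, rev_nat and reverse_translate4 are made up here), a recvmsg4 program along the lines described above could look roughly like this, assuming the companion sendmsg4 hook records the original service tuple keyed by socket cookie:

  /* SPDX-License-Identifier: GPL-2.0 */
  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  struct svc_val {
          __u32 ip4;      /* original service IP, network byte order */
          __u32 port;     /* original service port, network byte order */
  };

  /* socket cookie -> original service tuple, written by the
   * corresponding cgroup/sendmsg4 program (not shown)
   */
  struct bpf_map_def SEC("maps") rev_nat = {
          .type = BPF_MAP_TYPE_LRU_HASH,
          .key_size = sizeof(__u64),
          .value_size = sizeof(struct svc_val),
          .max_entries = 65536,
  };

  SEC("cgroup/recvmsg4")
  int reverse_translate4(struct bpf_sock_addr *ctx)
  {
          __u64 cookie = bpf_get_socket_cookie(ctx);
          struct svc_val *val = bpf_map_lookup_elem(&rev_nat, &cookie);

          if (val) {
                  /* rewrite msg_name back to the service address the
                   * application expects to see
                   */
                  ctx->user_ip4 = val->ip4;
                  ctx->user_port = val->port;
          }
          return 1;
  }

  char _license[] SEC("license") = "GPL";

Note the unconditional return 1: as the verifier change below enforces, recvmsg attach points only accept a return code of 1, since there is nothing sensible to reject at this stage.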
Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Acked-by: Martynas Pumputis Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..e8ba3a153691 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1581,6 +1581,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: return 0; default: return -EINVAL; @@ -1875,6 +1877,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1960,6 +1964,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -2011,6 +2017,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d2c8a6677ac4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5361,9 +5361,12 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: @@ -5380,16 +5383,17 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } return 0; -- cgit v1.2.3 From 54b7b868e826b294687c439b68ec55fe20cafe5b Mon Sep 17 00:00:00 2001 From: Angelo Ruocco Date: Tue, 21 May 2019 10:01:54 +0200 Subject: cgroup: let a symlink too be created with a cftype file This commit enables a cftype to have a symlink (of any name) that points to the file associated with the cftype. 
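As a hypothetical illustration (none of the names below come from this patch), a controller that wants a plain "weight_alias" symlink next to its "weight" file could declare:

  static int example_weight_show(struct seq_file *sf, void *v);
  static ssize_t example_weight_write(struct kernfs_open_file *of, char *buf,
                                      size_t nbytes, loff_t off);

  static struct cftype example_files[] = {
          {
                  .name = "weight",               /* the real file */
                  .link_name = "weight_alias",    /* the symlink */
                  .flags = CFTYPE_SYMLINKED,
                  .seq_show = example_weight_show,
                  .write = example_weight_write,
          },
          { }     /* terminator */
  };

Both names go through the same subsystem-prefixing logic, so on a hierarchy where the file shows up as "<subsys>.weight" the symlink shows up as "<subsys>.weight_alias".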
Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 426a0026225c..155048b0eca2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) +static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf, bool write_link_name) { struct cgroup_subsys *ss = cft->ss; @@ -1471,13 +1471,26 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - cft->name); + write_link_name ? cft->link_name : cft->name); } else { - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, write_link_name ? cft->link_name : cft->name, + CGROUP_FILE_NAME_MAX); } return buf; } +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, false); +} + +static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, true); +} + /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1636,6 +1649,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); + if (cft->flags & CFTYPE_SYMLINKED) + kernfs_remove_by_name(cgrp->kn, + cgroup_link_name(cgrp, cft, name)); } /** @@ -3821,6 +3837,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; + struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3851,6 +3868,14 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } + if (cft->flags & CFTYPE_SYMLINKED) { + kn_link = kernfs_create_link(cgrp->kn, + cgroup_link_name(cgrp, cft, name), + kn); + if (IS_ERR(kn_link)) + return PTR_ERR(kn_link); + } + return 0; } -- cgit v1.2.3 From cf8929885de318c0bf73438c9e5dde59d6536f7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jun 2019 03:35:41 -0600 Subject: cgroup/bfq: revert bfq.weight symlink change There's some discussion on how to do this the best, and Tejun prefers that BFQ just create the file itself instead of having cgroups support a symlink feature. Hence revert commit 54b7b868e826 and 19e9da9e86c4 for 5.2, and this can be done properly for 5.3. 
Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 155048b0eca2..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf, bool write_link_name) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { struct cgroup_subsys *ss = cft->ss; @@ -1471,26 +1471,13 @@ static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - write_link_name ? cft->link_name : cft->name); + cft->name); } else { - strscpy(buf, write_link_name ? cft->link_name : cft->name, - CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - return cgroup_fill_name(cgrp, cft, buf, false); -} - -static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - return cgroup_fill_name(cgrp, cft, buf, true); -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1649,9 +1636,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); - if (cft->flags & CFTYPE_SYMLINKED) - kernfs_remove_by_name(cgrp->kn, - cgroup_link_name(cgrp, cft, name)); } /** @@ -3837,7 +3821,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; - struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3868,14 +3851,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } - if (cft->flags & CFTYPE_SYMLINKED) { - kn_link = kernfs_create_link(cgrp->kn, - cgroup_link_name(cgrp, cft, name), - kn); - if (IS_ERR(kn_link)) - return PTR_ERR(kn_link); - } - return 0; } -- cgit v1.2.3 From c596687a008b579c503afb7a64fcacc7270fae9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Jun 2019 09:08:27 -0700 Subject: cgroup: Fix css_task_iter_advance_css_set() cset skip condition While adding handling for dying task group leaders c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") added an inverted cset skip condition to css_task_iter_advance_css_set(). It should skip cset if it's completely empty but was incorrectly testing for the inverse condition for the dying_tasks list. Fix it. 
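The inversion is easy to see in isolation (a standalone userspace sketch, not kernel code): the iterator must keep skipping only while a cset has neither live nor dying tasks.

  #include <stdbool.h>
  #include <stdio.h>

  /* fixed predicate: skip the cset only when it is completely empty */
  static bool skip_fixed(bool populated, bool dying_empty)
  {
          return !populated && dying_empty;
  }

  /* buggy predicate: the negated list_empty() made csets whose only
   * remaining tasks were dying get skipped instead of visited
   */
  static bool skip_buggy(bool populated, bool dying_empty)
  {
          return !populated && !dying_empty;
  }

  int main(void)
  {
          /* unpopulated cset that still has dying tasks: must be visited */
          printf("fixed skips: %d, buggy skips: %d\n",
                 skip_fixed(false, false), skip_buggy(false, false));
          return 0;
  }

The fixed predicate prints 0 (visit it) where the buggy one prints 1 (skip it), which is exactly what the one-character change below repairs.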
Signed-off-by: Tejun Heo Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") Reported-by: syzbot+d4bba5ccd4f9a2a68681@syzkaller.appspotmail.com --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9538a12d42d6..6420ff87d72c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4401,7 +4401,7 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset) && !list_empty(&cset->dying_tasks)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; -- cgit v1.2.3 From da2577fdd0932ea4eefe73903f1130ee366767d2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Sat, 8 Jun 2019 12:54:19 -0700 Subject: bpf: lpm_trie: check left child of last leftmost node for NULL If the leftmost parent node of the tree does not have a child on the left side, then trie_get_next_key (and bpftool map dump) will not look at the child on the right. This leads to the traversal missing elements. Lookup is not affected. Update selftest to handle this case. Reproducer: bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \ value 1 entries 256 name test_lpm flags 1 bpftool map update pinned /sys/fs/bpf/lpm key 8 0 0 0 0 0 value 1 bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0 0 128 value 2 bpftool map dump pinned /sys/fs/bpf/lpm Returns only 1 element. (2 expected) Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE") Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..864e2a496376 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -716,9 +716,14 @@ find_leftmost: * have exact two children, so this function will never return NULL. */ for (node = search_root; node;) { - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + if (node->flags & LPM_TREE_NODE_FLAG_IM) { + node = rcu_dereference(node->child[0]); + } else { next_node = node; - node = rcu_dereference(node->child[0]); + node = rcu_dereference(node->child[0]); + if (!node) + node = rcu_dereference(next_node->child[1]); + } } do_copy: next_key->prefixlen = next_node->prefixlen; -- cgit v1.2.3 From f6581f5b55141a95657ef5742cf6a6bfa20a109f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 May 2019 13:31:57 +0200 Subject: ptrace: restore smp_rmb() in __ptrace_may_access() Restore the read memory barrier in __ptrace_may_access() that was deleted a couple years ago. Also add comments on this barrier and the one it pairs with to explain why they're there (as far as I understand). Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks") Cc: stable@vger.kernel.org Acked-by: Kees Cook Acked-by: Oleg Nesterov Signed-off-by: Jann Horn Signed-off-by: Eric W.
Biederman --- kernel/cred.c | 9 +++++++++ kernel/ptrace.c | 10 ++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..07e069d00696 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -450,6 +450,15 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + /* + * If a task drops privileges and becomes nondumpable, + * the dumpability change must become visible before + * the credential change; otherwise, a __ptrace_may_access() + * racing with this change may be able to attach to a task it + * shouldn't be able to attach to (as if the task had dropped + * privileges without becoming nondumpable). + * Pairs with a read barrier in __ptrace_may_access(). + */ smp_wmb(); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 02c6528ead5c..c9b4646ad375 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -323,6 +323,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); + /* + * If a task drops privileges and becomes nondumpable (through a syscall + * like setresuid()) while we are trying to access it, we must ensure + * that the dumpability is read after the credentials; otherwise, + * we may be able to attach to a task that we shouldn't be able to + * attach to (as if the task had dropped privileges without becoming + * nondumpable). + * Pairs with a write barrier in commit_creds(). + */ + smp_rmb(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && -- cgit v1.2.3 From d477f8c202d1f0d4791ab1263ca7657bbe5cf79e Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Wed, 12 Jun 2019 11:50:48 -0400 Subject: cpuset: restore sanity to cpuset_cpus_allowed_fallback() In the case that a process is constrained by taskset(1) (i.e. sched_setaffinity(2)) to a subset of available cpus, and all of those are subsequently offlined, the scheduler will set tsk->cpus_allowed to the current value of task_cs(tsk)->effective_cpus. This is done via a call to do_set_cpus_allowed() in the context of cpuset_cpus_allowed_fallback() made by the scheduler when this case is detected. This is the only call made to cpuset_cpus_allowed_fallback() in the latest mainline kernel. However, this is not sane behavior. I will demonstrate this on a system running the latest upstream kernel with the following initial configuration: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,fffffff Cpus_allowed_list: 0-63 (Where cpus 32-63 are provided via smt.) If we limit our current shell process to cpu2 only and then offline it and reonline it: # taskset -p 4 $$ pid 2272's current affinity mask: ffffffffffffffff pid 2272's new affinity mask: 4 # echo off > /sys/devices/system/cpu/cpu2/online # dmesg | tail -3 [ 2195.866089] process 2272 (bash) no longer affine to cpu2 [ 2195.872700] IRQ 114: no longer affine to CPU2 [ 2195.879128] smpboot: CPU 2 is now offline # echo on > /sys/devices/system/cpu/cpu2/online # dmesg | tail -1 [ 2617.043572] smpboot: Booting Node 0 Processor 2 APIC 0x4 We see that our current process now has an affinity mask containing every cpu available on the system _except_ the one we originally constrained it to: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,fffffffb Cpus_allowed_list: 0-1,3-63 This is not sane behavior, as the scheduler can now not only place the process on previously forbidden cpus, it can't even schedule it on the cpu it was originally constrained to! 
Other cases result in even more exotic affinity masks. Take for instance a process with an affinity mask containing only cpus provided by smt at the moment that smt is toggled, in a configuration such as the following: # taskset -p f000000000 $$ # grep -i cpu /proc/$$/status Cpus_allowed: 000000f0,00000000 Cpus_allowed_list: 36-39 A double toggle of smt results in the following behavior: # echo off > /sys/devices/system/cpu/smt/control # echo on > /sys/devices/system/cpu/smt/control # grep -i cpus /proc/$$/status Cpus_allowed: ffffff00,ffffffff Cpus_allowed_list: 0-31,40-63 This is even less sane than the previous case, as the new affinity mask excludes all smt-provided cpus with ids less than those that were previously in the affinity mask, as well as those that were actually in the mask. With this patch applied, both of these cases end in the following state: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,ffffffff Cpus_allowed_list: 0-63 The original policy is discarded. Though not ideal, it is the simplest way to restore sanity to this fallback case without reinventing the cpuset wheel that rolls down the kernel just fine in cgroup v2. A user who wishes for the previous affinity mask to be restored in this fallback case can use that mechanism instead. This patch modifies scheduler behavior by instead resetting the mask to task_cs(tsk)->cpus_allowed by default, and cpu_possible mask in legacy mode. I tested the cases above on both modes. Note that the scheduler uses this fallback mechanism if and only if _every_ other valid avenue has been traveled, and it is the last resort before calling BUG(). Suggested-by: Waiman Long Suggested-by: Phil Auld Signed-off-by: Joel Savitz Acked-by: Phil Auld Acked-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..515525ff1cfd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + **/ + void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + do_set_cpus_allowed(tsk, is_in_v2_mode() ? + task_cs(tsk)->cpus_allowed : cpu_possible_mask); rcu_read_unlock(); /* -- cgit v1.2.3 From 2e3f139e8ecebf177fe01299285a56855e93fb84 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:21 -0700 Subject: mm/devm_memremap_pages: introduce devm_memunmap_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new devm_release_action() facility to allow devm_memremap_pages_release() to be manually triggered. 
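A hypothetical caller sketch (the driver structure and function names here are invented for illustration) of how the pair is meant to be used:

  #include <linux/device.h>
  #include <linux/err.h>
  #include <linux/memremap.h>

  struct example_drv {
          struct dev_pagemap pgmap;
          void *base;
  };

  static int example_setup(struct device *dev, struct example_drv *drv)
  {
          void *addr = devm_memremap_pages(dev, &drv->pgmap);

          if (IS_ERR(addr))
                  return PTR_ERR(addr);
          drv->base = addr;
          return 0;
  }

  static void example_shutdown(struct device *dev, struct example_drv *drv)
  {
          /* tear the pages down now rather than waiting for generic
           * devres cleanup at device removal
           */
          devm_memunmap_pages(dev, &drv->pgmap);
  }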
Link: http://lkml.kernel.org/r/155727337088.292046.5774214552136776763.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Jérôme Glisse" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 1490e63f69a9..715b434bd316 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -271,6 +271,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) } EXPORT_SYMBOL_GPL(devm_memremap_pages); +void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) +{ + devm_release_action(dev, devm_memremap_pages_release, pgmap); +} +EXPORT_SYMBOL_GPL(devm_memunmap_pages); + unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ -- cgit v1.2.3 From 50f44ee7248ad2f7984ef081974a6ecd09724b3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:33 -0700 Subject: mm/devm_memremap_pages: fix final page put race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Logan noticed that devm_memremap_pages_release() kills the percpu_ref, drops all the page references that were acquired at init, and then immediately proceeds to unplug, arch_remove_memory(), the backing pages for the pagemap. If for some reason device shutdown actually collides with a busy / elevated-ref-count page then arch_remove_memory() should be deferred until after that reference is dropped. As it stands the "wait for last page ref drop" happens *after* devm_memremap_pages_release() returns, which is obviously too late and can lead to crashes. Fix this situation by assigning the responsibility to wait for the percpu_ref to go idle to devm_memremap_pages() with a new ->cleanup() callback. Implement the new cleanup callback for all devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma. Link: http://lkml.kernel.org/r/155727339156.292046.5432007428235387859.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Dan Williams Reported-by: Logan Gunthorpe Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: "Jérôme Glisse" Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 715b434bd316..6e1970719dc2 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data) pgmap->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); + pgmap->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data) * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true * - * 3/ pgmap->ref must be 'live' on entry and will be killed at - * devm_memremap_pages_release() time, or if this routine fails. + * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped + * at devm_memremap_pages_release() time, or if this routine fails.
* * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - if (!pgmap->ref || !pgmap->kill) + if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); + } align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } is_ram = region_intersects(align_start, align_size, @@ -267,6 +272,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap_array_delete(res); err_array: pgmap->kill(pgmap->ref); + pgmap->cleanup(pgmap->ref); + return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); -- cgit v1.2.3 From e3ff9c3678b4d80e22d2557b68726174578eaf52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Jun 2019 21:40:45 +0200 Subject: timekeeping: Repair ktime_get_coarse*() granularity Jason reported that the coarse ktime based time getters advance only once per second and not once per tick as advertised. The code reads only the monotonic base time, which advances once per second. The nanoseconds are accumulated on every tick in xtime_nsec up to a second and the regular time getters take this nanoseconds offset into account, but the ktime_get_coarse*() implementation fails to do so. Add the accumulated xtime_nsec value to the monotonic base time to get the proper per tick advancing coarse tinme. Fixes: b9ff604cff11 ("timekeeping: Add ktime_get_coarse_with_offset") Reported-by: Jason A. Donenfeld Signed-off-by: Thomas Gleixner Tested-by: Jason A. Donenfeld Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Clemens Ladisch Cc: Sultan Alsawaf Cc: Waiman Long Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1906132136280.1791@nanos.tec.linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 85f5912d8f70..44b726bab4bd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; + u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); - return base; - + return base + nsecs; } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); -- cgit v1.2.3 From becf33f694dc50656766e0fde8883437d5c8d4b4 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Mon, 10 Jun 2019 13:00:16 +0900 Subject: tracing: Fix out-of-range read in trace_stack_print() Puts range check before dereferencing the pointer. 
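The ordering matters because && short-circuits left to right, so the bound check must run before the dereference. A minimal userspace sketch of the same bug class (names invented):

  #include <stdio.h>

  static long sum_until_sentinel(const long *p, const long *end)
  {
          long sum = 0;

          /* correct: 'p < end' is evaluated before '*p', so the loop can
           * never read past the buffer; the reversed order
           * '*p != -1 && p < end' dereferences one element too far
           * whenever no sentinel is present
           */
          for (; p && p < end && *p != -1; p++)
                  sum += *p;
          return sum;
  }

  int main(void)
  {
          long stack[4] = { 1, 2, 3, 4 };         /* no sentinel */

          printf("%ld\n", sum_until_sentinel(stack, stack + 4));
          return 0;
  }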
Reproducer: # echo stacktrace > trace_options # echo 1 > events/enable # cat trace > /dev/null KASAN report: ================================================================== BUG: KASAN: use-after-free in trace_stack_print+0x26b/0x2c0 Read of size 8 at addr ffff888069d20000 by task cat/1953 CPU: 0 PID: 1953 Comm: cat Not tainted 5.2.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 Call Trace: dump_stack+0x8a/0xce print_address_description+0x60/0x224 ? trace_stack_print+0x26b/0x2c0 ? trace_stack_print+0x26b/0x2c0 __kasan_report.cold+0x1a/0x3e ? trace_stack_print+0x26b/0x2c0 kasan_report+0xe/0x20 trace_stack_print+0x26b/0x2c0 print_trace_line+0x6ea/0x14d0 ? tracing_buffers_read+0x700/0x700 ? trace_find_next_entry_inc+0x158/0x1d0 s_show+0xea/0x310 seq_read+0xaa7/0x10e0 ? seq_escape+0x230/0x230 __vfs_read+0x7c/0x100 vfs_read+0x16c/0x3a0 ksys_read+0x121/0x240 ? kernel_write+0x110/0x110 ? perf_trace_sys_enter+0x8a0/0x8a0 ? syscall_slow_exit_work+0xa9/0x410 do_syscall_64+0xb7/0x390 ? prepare_exit_to_usermode+0x165/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f867681f910 Code: b6 fe ff ff 48 8d 3d 0f be 08 00 48 83 ec 08 e8 06 db 01 00 66 0f 1f 44 00 00 83 3d f9 2d 2c 00 00 75 10 b8 00 00 00 00 04 RSP: 002b:00007ffdabf23488 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f867681f910 RDX: 0000000000020000 RSI: 00007f8676cde000 RDI: 0000000000000003 RBP: 00007f8676cde000 R08: ffffffffffffffff R09: 0000000000000000 R10: 0000000000000871 R11: 0000000000000246 R12: 00007f8676cde000 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000000ec0 Allocated by task 1214: save_stack+0x1b/0x80 __kasan_kmalloc.constprop.0+0xc2/0xd0 kmem_cache_alloc+0xaf/0x1a0 getname_flags+0xd2/0x5b0 do_sys_open+0x277/0x5a0 do_syscall_64+0xb7/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 1214: save_stack+0x1b/0x80 __kasan_slab_free+0x12c/0x170 kmem_cache_free+0x8a/0x1c0 putname+0xe1/0x120 do_sys_open+0x2c5/0x5a0 do_syscall_64+0xb7/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888069d20000 which belongs to the cache names_cache of size 4096 The buggy address is located 0 bytes inside of 4096-byte region [ffff888069d20000, ffff888069d21000) The buggy address belongs to the page: page:ffffea0001a74800 refcount:1 mapcount:0 mapping:ffff88806ccd1380 index:0x0 compound_mapcount: 0 flags: 0x100000000010200(slab|head) raw: 0100000000010200 dead000000000100 dead000000000200 ffff88806ccd1380 raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888069d1ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888069d1ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888069d20000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888069d20080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888069d20100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Link: http://lkml.kernel.org/r/20190610040016.5598-1-devel@etsukata.com Fixes: 4285f2fcef80 ("tracing: Remove the ULONG_MAX stack trace hackery") Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 54373d93e251..ba751f993c3b 
100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "\n"); - for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { + for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) { if (trace_seq_has_overflowed(s)) break; -- cgit v1.2.3 From cbdaeaf050b730ea02e9ab4ff844ce54d85dbe1d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 5 Jun 2019 13:11:58 +0200 Subject: tracing: avoid build warning with HAVE_NOP_MCOUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting HAVE_NOP_MCOUNT enables -mnop-mcount (if gcc supports it) and sets CC_USING_NOP_MCOUNT. Reuse __is_defined (which is suitable for testing CC_USING_* defines) to avoid conditional compilation and fix the following gcc 9 warning on s390: kernel/trace/ftrace.c:2514:1: warning: ‘ftrace_code_disable’ defined but not used [-Wunused-function] Link: http://lkml.kernel.org/r/patch.git-1a82d13f33ac.your-ad-here.call-01559732716-ext-6629@work.hours Fixes: 2f4df0017baed ("tracing: Add -mcount-nop option support") Signed-off-by: Vasily Gorbik Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..e77a6c92620f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2935,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; -#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!ftrace_code_disable(mod, p)) + if (!__is_defined(CC_USING_NOP_MCOUNT) && + !ftrace_code_disable(mod, p)) break; -#endif update_cnt++; } -- cgit v1.2.3 From ff585c5b9a27e64084c84e2ddf24fd00bf8dcfc1 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 14 Jun 2019 23:32:10 +0800 Subject: tracing: Make two symbols static Fix sparse warnings: kernel/trace/trace.c:6927:24: warning: symbol 'get_tracing_log_err' was not declared. Should it be static? kernel/trace/trace.c:8196:15: warning: symbol 'trace_instance_dir' was not declared. Should it be static? 
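For reference, this class of warning comes from building with sparse (e.g. "make C=1") against symbols that have external linkage but no declaration visible in any header; when nothing outside the translation unit uses them, internal linkage is the fix:

  /* before: external linkage, but no other file references it */
  struct dentry *trace_instance_dir;

  /* after: the file-local intent is explicit and sparse is satisfied */
  static struct dentry *trace_instance_dir;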
Link: http://lkml.kernel.org/r/20190614153210.24424-1-yuehaibing@huawei.com Acked-by: Tom Zanussi Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c80521fd436..83e08b78dbee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6923,7 +6923,7 @@ struct tracing_log_err { static DEFINE_MUTEX(tracing_err_log_lock); -struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; @@ -8192,7 +8192,7 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; -struct dentry *trace_instance_dir; +static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); -- cgit v1.2.3 From f01098c74b5219f3969d4750eeed1a36bfc038e3 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 14 Jun 2019 16:40:25 +0900 Subject: tracing/uprobe: Fix NULL pointer dereference in trace_uprobe_create() Just like the case of commit 8b05a3a7503c ("tracing/kprobes: Fix NULL pointer dereference in trace_kprobe_create()"), writing an incorrectly formatted string to uprobe_events can trigger NULL pointer dereference. Reproducer: # echo r > /sys/kernel/debug/tracing/uprobe_events dmesg: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000079d12067 P4D 8000000079d12067 PUD 7b7ab067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1903 Comm: bash Not tainted 5.2.0-rc3+ #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 RIP: 0010:strchr+0x0/0x30 Code: c0 eb 0d 84 c9 74 18 48 83 c0 01 48 39 d0 74 0f 0f b6 0c 07 3a 0c 06 74 ea 19 c0 83 c8 01 c3 31 c0 c3 0f 1f 84 00 00 00 00 00 <0f> b6 07 89 f2 40 38 f0 75 0e eb 13 0f b6 47 01 48 83 c RSP: 0018:ffffb55fc0403d10 EFLAGS: 00010293 RAX: ffff993ffb793400 RBX: 0000000000000000 RCX: ffffffffa4852625 RDX: 0000000000000000 RSI: 000000000000002f RDI: 0000000000000000 RBP: ffffb55fc0403dd0 R08: ffff993ffb793400 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff993ff9cc1668 R14: 0000000000000001 R15: 0000000000000000 FS: 00007f30c5147700(0000) GS:ffff993ffda00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b628000 CR4: 00000000000006f0 Call Trace: trace_uprobe_create+0xe6/0xb10 ? __kmalloc_track_caller+0xe6/0x1c0 ? __kmalloc+0xf0/0x1d0 ? trace_uprobe_create+0xb10/0xb10 create_or_delete_trace_uprobe+0x35/0x90 ? trace_uprobe_create+0xb10/0xb10 trace_run_command+0x9c/0xb0 trace_parse_run_command+0xf9/0x1eb ?
probes_open+0x80/0x80 __vfs_write+0x43/0x90 vfs_write+0x14a/0x2a0 ksys_write+0xa2/0x170 do_syscall_64+0x7f/0x200 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: http://lkml.kernel.org/r/20190614074026.8045-1-devel@etsukata.com Cc: stable@vger.kernel.org Fixes: 0597c49c69d5 ("tracing/uprobes: Use dyn_event framework for uprobe events") Reviewed-by: Srikar Dronamraju Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index eb7e06b54741..a88c692e3b8a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -443,10 +443,17 @@ static int trace_uprobe_create(int argc, const char **argv) ret = 0; ref_ctr_offset = 0; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') + switch (argv[0][0]) { + case 'r': is_return = true; - else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + + if (argc < 2) return -ECANCELED; if (argv[0][1] == ':') -- cgit v1.2.3 From a4158345ec5acb44cc0a9ef4381e0784c1bc7722 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 14 Jun 2019 16:40:26 +0900 Subject: tracing/uprobe: Fix obsolete comment on trace_uprobe_create() Commit 0597c49c69d5 ("tracing/uprobes: Use dyn_event framework for uprobe events") cleaned up the usage of trace_uprobe_create(), and the function has been no longer used for removing uprobe/uretprobe. Link: http://lkml.kernel.org/r/20190614074026.8045-2-devel@etsukata.com Reviewed-by: Srikar Dronamraju Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a88c692e3b8a..b55906c77ce0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -426,8 +426,6 @@ end: /* * Argument syntax: * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] - * - * - Remove uprobe: -:[GRP/]EVENT */ static int trace_uprobe_create(int argc, const char **argv) { -- cgit v1.2.3 From 9f255b632bf12c4dd7fc31caee89aa991ef75176 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 13 Jun 2019 20:07:22 -0500 Subject: module: Fix livepatch/ftrace module text permissions race It's possible for livepatch and ftrace to be toggling a module's text permissions at the same time, resulting in the following panic: BUG: unable to handle page fault for address: ffffffffc005b1d9 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 3ea0c067 P4D 3ea0c067 PUD 3ea0e067 PMD 3cc13067 PTE 3b8a1061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 1 PID: 453 Comm: insmod Tainted: G O K 5.2.0-rc1-a188339ca5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014 RIP: 0010:apply_relocate_add+0xbe/0x14c Code: fa 0b 74 21 48 83 fa 18 74 38 48 83 fa 0a 75 40 eb 08 48 83 38 00 74 33 eb 53 83 38 00 75 4e 89 08 89 c8 eb 0a 83 38 00 75 43 <89> 08 48 63 c1 48 39 c8 74 2e eb 48 83 38 00 75 32 48 29 c1 89 08 RSP: 0018:ffffb223c00dbb10 EFLAGS: 00010246 RAX: ffffffffc005b1d9 RBX: 0000000000000000 RCX: ffffffff8b200060 RDX: 000000000000000b RSI: 0000004b0000000b RDI: ffff96bdfcd33000 RBP: ffffb223c00dbb38 R08: ffffffffc005d040 R09: ffffffffc005c1f0 R10: ffff96bdfcd33c40 R11: ffff96bdfcd33b80 R12: 0000000000000018 R13: ffffffffc005c1f0 R14: ffffffffc005e708 R15: 
ffffffff8b2fbc74 FS: 00007f5f447beba8(0000) GS:ffff96bdff900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc005b1d9 CR3: 000000003cedc002 CR4: 0000000000360ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: klp_init_object_loaded+0x10f/0x219 ? preempt_latency_start+0x21/0x57 klp_enable_patch+0x662/0x809 ? virt_to_head_page+0x3a/0x3c ? kfree+0x8c/0x126 patch_init+0x2ed/0x1000 [livepatch_test02] ? 0xffffffffc0060000 do_one_initcall+0x9f/0x1c5 ? kmem_cache_alloc_trace+0xc4/0xd4 ? do_init_module+0x27/0x210 do_init_module+0x5f/0x210 load_module+0x1c41/0x2290 ? fsnotify_path+0x3b/0x42 ? strstarts+0x2b/0x2b ? kernel_read+0x58/0x65 __do_sys_finit_module+0x9f/0xc3 ? __do_sys_finit_module+0x9f/0xc3 __x64_sys_finit_module+0x1a/0x1c do_syscall_64+0x52/0x61 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above panic occurs when loading two modules at the same time with ftrace enabled, where at least one of the modules is a livepatch module: CPU0 CPU1 klp_enable_patch() klp_init_object_loaded() module_disable_ro() ftrace_module_enable() ftrace_arch_code_modify_post_process() set_all_modules_text_ro() klp_write_object_relocations() apply_relocate_add() *patches read-only code* - BOOM A similar race exists when toggling ftrace while loading a livepatch module. Fix it by ensuring that the livepatch and ftrace code patching operations -- and their respective permissions changes -- are protected by the text_mutex. Link: http://lkml.kernel.org/r/ab43d56ab909469ac5d2520c5d944ad6d4abd476.1560474114.git.jpoimboe@redhat.com Reported-by: Johannes Erdfelt Fixes: 444d13ff10fb ("modules: add ro_after_init support") Acked-by: Jessica Yu Reviewed-by: Petr Mladek Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/core.c | 6 ++++++ kernel/trace/ftrace.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 91cd519756d3..2d17e6e364b5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "core.h" #include "patch.h" @@ -730,16 +731,21 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); if (ret) { module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); return ret; } arch_klp_init_object_loaded(patch, obj); module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e77a6c92620f..a89700590485 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -2610,10 +2611,12 @@ static void ftrace_run_update_code(int command) { int ret; + mutex_lock(&text_mutex); + ret = ftrace_arch_code_modify_prepare(); FTRACE_WARN_ON(ret); if (ret) - return; + goto out_unlock; /* * By default we use stop_machine() to modify the code. 
@@ -2625,6 +2628,9 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); + +out_unlock: + mutex_unlock(&text_mutex); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -5775,6 +5781,7 @@ void ftrace_module_enable(struct module *mod) struct ftrace_page *pg; mutex_lock(&ftrace_lock); + mutex_lock(&text_mutex); if (ftrace_disabled) goto out_unlock; @@ -5836,6 +5843,7 @@ void ftrace_module_enable(struct module *mod) ftrace_arch_code_modify_post_process(); out_unlock: + mutex_unlock(&text_mutex); mutex_unlock(&ftrace_lock); process_cached_mods(mod->name); -- cgit v1.2.3 From 04e03d9a616c19a47178eaca835358610e63a1dd Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 6 Jun 2019 11:17:54 +0800 Subject: ftrace: Fix NULL pointer dereference in free_ftrace_func_mapper() The mapper may be NULL when called from register_ftrace_function_probe() with probe->data == NULL. This issue can be reproduced as follow (it may be covered by compiler optimization sometime): / # cat /sys/kernel/debug/tracing/set_ftrace_filter #### all functions enabled #### / # echo foo_bar:dump > /sys/kernel/debug/tracing/set_ftrace_filter [ 206.949100] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 206.952402] Mem abort info: [ 206.952819] ESR = 0x96000006 [ 206.955326] Exception class = DABT (current EL), IL = 32 bits [ 206.955844] SET = 0, FnV = 0 [ 206.956272] EA = 0, S1PTW = 0 [ 206.956652] Data abort info: [ 206.957320] ISV = 0, ISS = 0x00000006 [ 206.959271] CM = 0, WnR = 0 [ 206.959938] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000419f3a000 [ 206.960483] [0000000000000000] pgd=0000000411a87003, pud=0000000411a83003, pmd=0000000000000000 [ 206.964953] Internal error: Oops: 96000006 [#1] SMP [ 206.971122] Dumping ftrace buffer: [ 206.973677] (ftrace buffer empty) [ 206.975258] Modules linked in: [ 206.976631] Process sh (pid: 281, stack limit = 0x(____ptrval____)) [ 206.978449] CPU: 10 PID: 281 Comm: sh Not tainted 5.2.0-rc1+ #17 [ 206.978955] Hardware name: linux,dummy-virt (DT) [ 206.979883] pstate: 60000005 (nZCv daif -PAN -UAO) [ 206.980499] pc : free_ftrace_func_mapper+0x2c/0x118 [ 206.980874] lr : ftrace_count_free+0x68/0x80 [ 206.982539] sp : ffff0000182f3ab0 [ 206.983102] x29: ffff0000182f3ab0 x28: ffff8003d0ec1700 [ 206.983632] x27: ffff000013054b40 x26: 0000000000000001 [ 206.984000] x25: ffff00001385f000 x24: 0000000000000000 [ 206.984394] x23: ffff000013453000 x22: ffff000013054000 [ 206.984775] x21: 0000000000000000 x20: ffff00001385fe28 [ 206.986575] x19: ffff000013872c30 x18: 0000000000000000 [ 206.987111] x17: 0000000000000000 x16: 0000000000000000 [ 206.987491] x15: ffffffffffffffb0 x14: 0000000000000000 [ 206.987850] x13: 000000000017430e x12: 0000000000000580 [ 206.988251] x11: 0000000000000000 x10: cccccccccccccccc [ 206.988740] x9 : 0000000000000000 x8 : ffff000013917550 [ 206.990198] x7 : ffff000012fac2e8 x6 : ffff000012fac000 [ 206.991008] x5 : ffff0000103da588 x4 : 0000000000000001 [ 206.991395] x3 : 0000000000000001 x2 : ffff000013872a28 [ 206.991771] x1 : 0000000000000000 x0 : 0000000000000000 [ 206.992557] Call trace: [ 206.993101] free_ftrace_func_mapper+0x2c/0x118 [ 206.994827] ftrace_count_free+0x68/0x80 [ 206.995238] release_probe+0xfc/0x1d0 [ 206.995555] register_ftrace_function_probe+0x4a8/0x868 [ 206.995923] ftrace_trace_probe_callback.isra.4+0xb8/0x180 [ 206.996330] ftrace_dump_callback+0x50/0x70 [ 206.996663] ftrace_regex_write.isra.29+0x290/0x3a8 [ 206.997157] 
ftrace_filter_write+0x44/0x60 [ 206.998971] __vfs_write+0x64/0xf0 [ 206.999285] vfs_write+0x14c/0x2f0 [ 206.999591] ksys_write+0xbc/0x1b0 [ 206.999888] __arm64_sys_write+0x3c/0x58 [ 207.000246] el0_svc_common.constprop.0+0x408/0x5f0 [ 207.000607] el0_svc_handler+0x144/0x1c8 [ 207.000916] el0_svc+0x8/0xc [ 207.003699] Code: aa0003f8 a9025bf5 aa0103f5 f946ea80 (f9400303) [ 207.008388] ---[ end trace 7b6d11b5f542bdf1 ]--- [ 207.010126] Kernel panic - not syncing: Fatal exception [ 207.011322] SMP: stopping secondary CPUs [ 207.013956] Dumping ftrace buffer: [ 207.014595] (ftrace buffer empty) [ 207.015632] Kernel Offset: disabled [ 207.017187] CPU features: 0x002,20006008 [ 207.017985] Memory Limit: none [ 207.019825] ---[ end Kernel panic - not syncing: Fatal exception ]--- Link: http://lkml.kernel.org/r/20190606031754.10798-1-liwei391@huawei.com Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a89700590485..38277af44f5c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4226,10 +4226,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, struct ftrace_func_entry *entry; struct ftrace_func_map *map; struct hlist_head *hhd; - int size = 1 << mapper->hash.size_bits; - int i; + int size, i; + + if (!mapper) + return; if (free_func && mapper->hash.count) { + size = 1 << mapper->hash.size_bits; for (i = 0; i < size; i++) { hhd = &mapper->hash.buckets[i]; hlist_for_each_entry(entry, hhd, hlist) { -- cgit v1.2.3 From d4dd153d551634683fccf8881f606fa9f3dfa1ef Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:13 +0900 Subject: bpf, devmap: Fix premature entry free on destroying map dev_map_free() waits for flush_needed bitmap to be empty in order to ensure all flush operations have completed before freeing its entries. However the corresponding clear_bit() was called before using the entries, so the entries could be used after free. All access to the entries needs to be done before clearing the bit. It seems commit a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") accidentally changed the clear_bit() and memory access order. Note that the problem happens only in __dev_map_flush(), not in dev_map_flush_old(). dev_map_flush_old() is called only after nulling out the corresponding netdev_map entry, so dev_map_free() never frees the entry thus no such race happens there. Fixes: a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..e001fb1a96b1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -291,10 +291,10 @@ void __dev_map_flush(struct bpf_map *map) if (unlikely(!dev)) continue; - __clear_bit(bit, bitmap); - bq = this_cpu_ptr(dev->bulkq); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); + + __clear_bit(bit, bitmap); } } -- cgit v1.2.3 From edabf4d9dd905acd60048ea1579943801e3a4876 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:14 +0900 Subject: bpf, devmap: Add missing bulk queue free dev_map_free() forgot to free bulk queue when freeing its entries. 
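In sketch form (abbreviated for illustration, not the literal devmap code), every per-cpu allocation made when an entry is created needs a matching free on the map teardown path:

  /* at entry creation */
  dev->bulkq = alloc_percpu(struct xdp_bulk_queue);
  if (!dev->bulkq)
          return -ENOMEM;

  /* at map teardown in dev_map_free() */
  free_percpu(dev->bulkq);        /* the line this fix adds */
  dev_put(dev->dev);
  kfree(dev);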
Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e001fb1a96b1..a126d95d12de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -186,6 +186,7 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } -- cgit v1.2.3 From 86723c8640633bee4b4588d3c7784ee7a0032f65 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:15 +0900 Subject: bpf, devmap: Add missing RCU read lock on flush .ndo_xdp_xmit() assumes it is called under RCU. For example virtio_net uses RCU to detect it has setup the resources for tx. The assumption accidentally broke when introducing bulk queue in devmap. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Reported-by: David Ahern Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a126d95d12de..1defea4b2755 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -282,6 +282,7 @@ void __dev_map_flush(struct bpf_map *map) unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); u32 bit; + rcu_read_lock(); for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; @@ -297,6 +298,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); } + rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -389,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) int cpu; + rcu_read_lock(); for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); @@ -396,6 +399,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) bq = per_cpu_ptr(dev->bulkq, cpu); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } + rcu_read_unlock(); } } -- cgit v1.2.3 From a8e11e5c5611a9f70470aebeb2c1dd6132f609d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2019 16:22:18 -0700 Subject: sysctl: define proc_do_static_key() Convert proc_dointvec_minmax_bpf_stats() into a more generic helper, since we are going to use jump labels more often. Note that sysctl_bpf_stats_enabled is removed, since it is no longer needed/used. Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 1 - kernel/sysctl.c | 44 +++++++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c473f208a10..080e2bb644cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2097,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -int sysctl_bpf_stats_enabled __read_mostly; /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d1008be6173..1beca96fb625 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -230,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_BPF_SYSCALL -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1253,12 +1248,10 @@ static struct ctl_table kern_table[] = { }, { .procname = "bpf_stats_enabled", - .data = &sysctl_bpf_stats_enabled, - .maxlen = sizeof(sysctl_bpf_stats_enabled), + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_dointvec_minmax_bpf_stats, - .extra1 = &zero, - .extra2 = &one, + .proc_handler = proc_do_static_key, }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) @@ -3374,26 +3367,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - int ret, bpf_stats = *(int *)table->data; - struct ctl_table tmp = *table; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &one, + }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - tmp.data = &bpf_stats; + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { - *(int *)table->data = bpf_stats; - if (bpf_stats) - static_branch_enable(&bpf_stats_enabled_key); + if (val) + static_key_enable(key); else - static_branch_disable(&bpf_stats_enabled_key); + static_key_disable(key); } + mutex_unlock(&static_key_mutex); return ret; } #endif -- cgit v1.2.3 From 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Tue, 11 Jun 2019 14:53:04 -0700 Subject: bpf: fix nested bpf tracepoints with per-cpu data BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as they do not increment bpf_prog_active while executing. This enables three levels of nesting, to support - a kprobe or raw tp or perf event, - another one of the above that irq context happens to call, and - another one in nmi context (at most one of which may be a kprobe or perf event). 
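Condensed from the hunks that follow, the fix reserves one scratch perf_sample_data per context and indexes it with a per-CPU nesting counter, bailing out with -EBUSY if nesting ever exceeds the three levels listed above:

    struct bpf_trace_sample_data {
            struct perf_sample_data sds[3];         /* normal, irq, nmi */
    };
    static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
    static DEFINE_PER_CPU(int, bpf_trace_nest_level);

    struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
    int nest_level = this_cpu_inc_return(bpf_trace_nest_level);

    if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds)))
            goto out;                       /* err = -EBUSY, then dec */
    sd = &sds->sds[nest_level - 1];         /* this context's private slot */

The same counter-plus-array scheme covers the raw-tracepoint pt_regs scratch space through get_bpf_raw_tp_regs()/put_bpf_raw_tp_regs() in the diff below.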
Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Signed-off-by: Matt Mullins Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 100 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..1c9a4745e596 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); - static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_sample_data *sd) @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return perf_event_output(event, sd, regs); } +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; + struct perf_sample_data *sd; + int err; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, sd); + +out: + this_cpu_dec(bpf_trace_nest_level); + return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. 
*/ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, - flags, 0, 0); + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, - (unsigned long) size, flags, 0); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { -- cgit v1.2.3 From 085ebfe937d7a7a5df1729f35a12d6d655fea68c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 14:37:24 +0200 Subject: perf/core: Fix perf_sample_regs_user() mm check perf_sample_regs_user() uses 'current->mm' to test for the presence of userspace, but this is insufficient, consider use_mm(). A better test is: '!(current->flags & PF_KTHREAD)', exec() clears PF_KTHREAD after it sets the new ->mm but before it drops to userspace for the first time. Possibly obsoletes: bf05fc25f268 ("powerpc/perf: Fix oops when kthread execs user process") Reported-by: Ravi Bangoria Reported-by: Young Xiao <92siuyang@gmail.com> Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: 4018994f3d87 ("perf: Add ability to attach user level registers dump to sample") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..2e32faac5511 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5923,7 +5923,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (current->mm) { + } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; -- cgit v1.2.3 From 40b0b3f8fb2d8f55d13ceed41593d46689a6b496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jun 2019 07:44:46 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 230 Based on 2 normalized pattern(s): this source code is licensed under the gnu general public license version 2 see the file copying for more details this source code is licensed under general public license version 2 see extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 52 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Reviewed-by: Alexios Zavras Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190602204653.449021192@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/crash_core.c | 4 +--- kernel/kexec.c | 4 +--- kernel/kexec_core.c | 4 +--- kernel/kexec_file.c | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 093c9f917ed0..9f1557b98468 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * crash.c - kernel crash support code. * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include diff --git a/kernel/kexec.c b/kernel/kexec.c index 68559808fdfa..1b018f1a6e0d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fd5c95ff9251..d5870723b8ad 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 072b6ee55e3f..ef7b951a8087 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec: kexec_file_load system call * * Copyright (C) 2014 Red Hat Inc. * Authors: * Vivek Goyal - * - * This source code is licensed under the GNU General Public License, - * Version 2. 
See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- cgit v1.2.3 From 8092f73c51567470bd79472c6eb25d2e1841fac3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jun 2019 07:45:04 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 248 Based on 1 normalized pattern(s): this file is released under the gpl v2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 3 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Reviewed-by: Armijn Hemel Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190602204655.103854853@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/power/poweroff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7ef6866b521d..6d475281c730 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 */ #include -- cgit v1.2.3 From f85d208658468b1a298f31daddb05a7684969cd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:10:45 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 451 Based on 1 normalized pattern(s): this file is subject to the terms and conditions of version 2 of the gnu general public license see the file copying in the main directory of the linux distribution for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081200.872755311@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 5 +---- kernel/bpf/lpm_trie.c | 5 +---- kernel/cgroup/pids.c | 5 +---- kernel/cgroup/rdma.c | 5 +---- 4 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fcde0f7b2585..92a7d0cf8d13 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Functions to manage eBPF programs attached to cgroups * * Copyright (c) 2016 Daniel Mack - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..1647a3f763a7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Longest prefix match list implementation * * Copyright (c) 2016,2017 Daniel Mack * Copyright (c) 2016 David Herrmann - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. 
*/ #include diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index c9960baaa14f..8e513a573fe9 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * @@ -25,10 +26,6 @@ * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 1d75ae7f1cb7..ae042c347c64 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * @@ -5,10 +6,6 @@ * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include -- cgit v1.2.3 From d2912cb15bdda8ba4a5dd73396ad62641af2f520 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:11:33 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 500 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation # extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 4122 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Kate Stewart Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081206.933168790@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/inode.c | 5 +---- kernel/compat.c | 5 +---- kernel/sched/debug.c | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 84a80b02db99..cc0d0cf114e3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. @@ -5,10 +6,6 @@ * Authors: * * Daniel Borkmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. */ #include diff --git a/kernel/compat.c b/kernel/compat.c index b5f7063c0db6..a2bc1d6ceb57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/compat.c * @@ -5,10 +6,6 @@ * on 64 bit kernels. * * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 678bfb9bd87f..14c6a8716ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/debug.c * * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include "sched.h" -- cgit v1.2.3 From 9014143bab2f3bc0b9e5db3bc8d00e2a43e50fbd Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sun, 23 Jun 2019 14:27:17 +0300 Subject: fork: don't check parent_tidptr with CLONE_PIDFD Give userspace a cheap and reliable way to tell whether CLONE_PIDFD is supported by the kernel or not. The easiest way is to pass an invalid file descriptor value in parent_tidptr, perform the syscall and verify that parent_tidptr has been changed to a valid file descriptor value. CLONE_PIDFD uses parent_tidptr to return pidfds. CLONE_PARENT_SETTID will use parent_tidptr to return the tid of the parent. The two flags cannot be used together. Old kernels that only support CLONE_PARENT_SETTID will not verify the value pointed to by parent_tidptr. This behavior is unchanged even with the introduction of CLONE_PIDFD. However, if CLONE_PIDFD is specified the kernel will currently check the value pointed to by parent_tidptr before placing the pidfd in the memory pointed to. EINVAL will be returned if the value in parent_tidptr is not 0. If CLONE_PIDFD is supported and fd 0 is closed, then the returned pidfd can and likely will be 0 and parent_tidptr will be unchanged. This means userspace must either check CLONE_PIDFD support beforehand or check that fd 0 is not closed when invoking CLONE_PIDFD. The check for pidfd == 0 was introduced during the v5.2 merge window by commit b3e583825266 ("clone: add CLONE_PIDFD") to ensure that CLONE_PIDFD could be potentially extended by passing in flags through the return argument. However, that extension would look horrible, and with the upcoming introduction of the clone3 syscall in v5.3 there is no need to extend legacy clone syscall this way. (Even if it would need to be extended, CLONE_DETACHED can be reused with CLONE_PIDFD.) So remove the pidfd == 0 check. Userspace that needs to be portable to kernels without CLONE_PIDFD support can then be advised to initialize pidfd to -1 and check the pidfd value returned by CLONE_PIDFD. Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Dmitry V. Levin Signed-off-by: Christian Brauner --- kernel/fork.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..39a3adaa4ad1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1822,8 +1822,6 @@ static __latent_entropy struct task_struct *copy_process( } if (clone_flags & CLONE_PIDFD) { - int reserved; - /* * - CLONE_PARENT_SETTID is useless for pidfds and also * parent_tidptr is used to return pidfds. @@ -1834,16 +1832,6 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) return ERR_PTR(-EINVAL); - - /* - * Verify that parent_tidptr is sane so we can potentially - * reuse it later. 
- */ - if (get_user(reserved, parent_tidptr)) - return ERR_PTR(-EFAULT); - - if (reserved != 0) - return ERR_PTR(-EINVAL); } /* -- cgit v1.2.3 From 913a90bc5a3a06b1f04c337320e9aeee2328dd77 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 4 Jun 2019 09:59:53 +0530 Subject: perf/ioctl: Add check for the sample_period value perf_event_open() limits the sample_period to 63 bits. See: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits") Make ioctl() consistent with it. Also on PowerPC, negative sample_period could cause a recursive PMIs leading to a hang (reported when running perf-fuzzer). Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: maddy@linux.vnet.ibm.com Cc: mpe@ellerman.id.au Fixes: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits") Link: https://lkml.kernel.org/r/20190604042953.914-1-ravi.bangoria@linux.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2e32faac5511..8d1c62df20a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5005,6 +5005,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (perf_event_check_period(event, value)) return -EINVAL; + if (!event->attr.freq && (value & (1ULL << 63))) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; -- cgit v1.2.3 From e321d02db87af7840da29ef833a2a71fc0eab198 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 May 2019 15:08:30 -0700 Subject: perf/x86: Disable extended registers for non-supported PMUs The perf fuzzer caused Skylake machine to crash: [ 9680.085831] Call Trace: [ 9680.088301] [ 9680.090363] perf_output_sample_regs+0x43/0xa0 [ 9680.094928] perf_output_sample+0x3aa/0x7a0 [ 9680.099181] perf_event_output_forward+0x53/0x80 [ 9680.103917] __perf_event_overflow+0x52/0xf0 [ 9680.108266] ? perf_trace_run_bpf_submit+0xc0/0xc0 [ 9680.113108] perf_swevent_hrtimer+0xe2/0x150 [ 9680.117475] ? check_preempt_wakeup+0x181/0x230 [ 9680.122091] ? check_preempt_curr+0x62/0x90 [ 9680.126361] ? ttwu_do_wakeup+0x19/0x140 [ 9680.130355] ? try_to_wake_up+0x54/0x460 [ 9680.134366] ? reweight_entity+0x15b/0x1a0 [ 9680.138559] ? __queue_work+0x103/0x3f0 [ 9680.142472] ? update_dl_rq_load_avg+0x1cd/0x270 [ 9680.147194] ? timerqueue_del+0x1e/0x40 [ 9680.151092] ? __remove_hrtimer+0x35/0x70 [ 9680.155191] __hrtimer_run_queues+0x100/0x280 [ 9680.159658] hrtimer_interrupt+0x100/0x220 [ 9680.163835] smp_apic_timer_interrupt+0x6a/0x140 [ 9680.168555] apic_timer_interrupt+0xf/0x20 [ 9680.172756] The XMM registers can only be collected by PEBS hardware events on the platforms with PEBS baseline support, e.g. Icelake, not software/probe events. Add capabilities flag PERF_PMU_CAP_EXTENDED_REGS to indicate the PMU which support extended registers. For X86, the extended registers are XMM registers. Add has_extended_regs() to check if extended registers are applied. The generic code define the mask of extended registers as 0 if arch headers haven't overridden it. 
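The arch-override pattern described here lives in headers outside kernel/, so it does not appear in this log; presumably it looks like the following sketch, with a generic fallback of 0 and an x86 definition covering the XMM range:

    /* generic header (sketch): default when an arch defines nothing */
    #ifndef PERF_REG_EXTENDED_MASK
    #define PERF_REG_EXTENDED_MASK  0
    #endif

    /* x86 (sketch): everything from XMM0 upward counts as extended */
    #define PERF_REG_EXTENDED_MASK  (~((1ULL << PERF_REG_X86_XMM0) - 1))

With the generic mask at 0, has_extended_regs() in the hunk that follows is constant-false on architectures that never opt in.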
Originally-by: Peter Zijlstra (Intel) Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: 878068ea270e ("perf/x86: Support outputting XMM registers") Link: https://lkml.kernel.org/r/1559081314-9714-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8d1c62df20a7..f85929ce13be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10036,6 +10036,12 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static inline bool has_extended_regs(struct perf_event *event) +{ + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || + (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); +} + static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; @@ -10067,12 +10073,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) + ret = -EOPNOTSUPP; + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) { - if (event->destroy) - event->destroy(event); + event_has_any_exclude_flag(event)) ret = -EINVAL; - } + + if (ret && event->destroy) + event->destroy(event); } if (ret) -- cgit v1.2.3 From 1bf72720281770162c87990697eae1ba2f1d917a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 16 May 2019 09:09:35 +0200 Subject: cpu/speculation: Warn on unsupported mitigations= parameter Currently, if the user specifies an unsupported mitigation strategy on the kernel command line, it will be ignored silently. The code will fall back to the default strategy, possibly leaving the system more vulnerable than expected. This may happen due to e.g. a simple typo, or, for a stable kernel release, because not all mitigation strategies have been backported. Inform the user by printing a message. Fixes: 98af8452945c5565 ("cpu/speculation: Add 'mitigations=' cmdline option") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190516070935.22546-1-geert@linux-m68k.org --- kernel/cpu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 077fde6fb953..551db494f153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2339,6 +2339,9 @@ static int __init mitigations_parse_cmdline(char *arg) cpu_mitigations = CPU_MITIGATIONS_AUTO; else if (!strcmp(arg, "auto,nosmt")) cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + else + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", + arg); return 0; } -- cgit v1.2.3 From 471a739a47aa7d582f0cdf9d392957d04632bae2 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 26 Jun 2019 00:20:23 +0200 Subject: PCI: PM: Avoid skipping bus-level PM on platforms without ACPI There are platforms that do not call pm_set_suspend_via_firmware(), so pm_suspend_via_firmware() returns 'false' on them, but the power states of PCI devices (PCIe ports in particular) are changed as a result of powering down core platform components during system-wide suspend. Thus the pm_suspend_via_firmware() checks in pci_pm_suspend_noirq() and pci_pm_resume_noirq() introduced by commit 3e26c5feed2a ("PCI: PM: Skip devices in D0 for suspend-to- idle") are not sufficient to determine that devices left in D0 during suspend will remain in D0 during resume and so the bus-level power management can be skipped for them. For this reason, introduce a new global suspend flag, PM_SUSPEND_FLAG_NO_PLATFORM, set it for suspend-to-idle only and replace the pm_suspend_via_firmware() checks mentioned above with checks against this flag. Fixes: 3e26c5feed2a ("PCI: PM: Skip devices in D0 for suspend-to-idle") Reported-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Rafael J. Wysocki Tested-by: Mika Westerberg Reviewed-by: Mika Westerberg --- kernel/power/suspend.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 9505101ed2bc..096211299c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -493,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state) pm_suspend_target_state = state; + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + error = platform_suspend_begin(state); if (error) goto Close; -- cgit v1.2.3 From 33d4a5a7a5b4d02915d765064b2319e90a11cbde Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Thu, 27 Jun 2019 11:47:32 +0900 Subject: cpu/hotplug: Fix out-of-bounds read when setting fail state Setting invalid value to /sys/devices/system/cpu/cpuX/hotplug/fail can control `struct cpuhp_step *sp` address, results in the following global-out-of-bounds read. Reproducer: # echo -2 > /sys/devices/system/cpu/cpu0/hotplug/fail KASAN report: BUG: KASAN: global-out-of-bounds in write_cpuhp_fail+0x2cd/0x2e0 Read of size 8 at addr ffffffff89734438 by task bash/1941 CPU: 0 PID: 1941 Comm: bash Not tainted 5.2.0-rc6+ #31 Call Trace: write_cpuhp_fail+0x2cd/0x2e0 dev_attr_store+0x58/0x80 sysfs_kf_write+0x13d/0x1a0 kernfs_fop_write+0x2bc/0x460 vfs_write+0x1e1/0x560 ksys_write+0x126/0x250 do_syscall_64+0xc1/0x390 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f05e4f4c970 The buggy address belongs to the variable: cpu_hotplug_lock+0x98/0xa0 Memory state around the buggy address: ffffffff89734300: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 00 ffffffff89734380: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 00 >ffffffff89734400: 00 00 00 00 fa fa fa fa 00 00 00 00 fa fa fa fa ^ ffffffff89734480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffffff89734500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Add a sanity check for the value written from user space. 
Fixes: 1db49484f21ed ("smp/hotplug: Hotplug state fail injection") Signed-off-by: Eiichi Tsukata Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/20190627024732.31672-1-devel@etsukata.com --- kernel/cpu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 551db494f153..ef1c565edc5d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1964,6 +1964,9 @@ static ssize_t write_cpuhp_fail(struct device *dev, if (ret) return ret; + if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) + return -EINVAL; + /* * Cannot fail STARTING/DYING callbacks. */ -- cgit v1.2.3 From 6fd2fe494b17bf2dec37b610d23a43a72b16923a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Jun 2019 22:22:09 -0400 Subject: copy_process(): don't use ksys_close() on cleanups anon_inode_getfd() should be used *ONLY* in situations when we are guaranteed to be past the last failure point (including copying the descriptor number to userland, at that). And ksys_close() should not be used for cleanups at all. anon_inode_getfile() is there for all nontrivial cases like that. Just use that... Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Al Viro Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- kernel/fork.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 39a3adaa4ad1..399aca51ff75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1712,31 +1712,6 @@ const struct file_operations pidfd_fops = { #endif }; -/** - * pidfd_create() - Create a new pid file descriptor. - * - * @pid: struct pid that the pidfd will reference - * - * This creates a new pid file descriptor with the O_CLOEXEC flag set. - * - * Note, that this function can only be called after the fd table has - * been unshared to avoid leaking the pidfd to the new process. - * - * Return: On success, a cloexec pidfd is returned. - * On error, a negative errno number will be returned. - */ -static int pidfd_create(struct pid *pid) -{ - int fd; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); - - return fd; -} - static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -1774,6 +1749,7 @@ static __latent_entropy struct task_struct *copy_process( int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; + struct file *pidfile = NULL; /* * Don't allow sharing the root directory with processes in a different @@ -2046,11 +2022,20 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). 
*/ if (clone_flags & CLONE_PIDFD) { - retval = pidfd_create(pid); + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; + + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfile)) { + put_unused_fd(pidfd); + goto bad_fork_free_pid; + } + get_pid(pid); /* held by pidfile now */ + retval = put_user(pidfd, parent_tidptr); if (retval) goto bad_fork_put_pidfd; @@ -2168,6 +2153,9 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* past the last point of failure */ + if (pidfile) + fd_install(pidfd, pidfile); init_task_pid_links(p); if (likely(p->pid)) { @@ -2234,8 +2222,10 @@ bad_fork_cancel_cgroup: bad_fork_cgroup_threadgroup_change_end: cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) - ksys_close(pidfd); + if (clone_flags & CLONE_PIDFD) { + fput(pidfile); + put_unused_fd(pidfd); + } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); -- cgit v1.2.3 From d5b844a2cf507fc7642c9ae80a9d585db3065c28 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 27 Jun 2019 10:13:34 +0200 Subject: ftrace/x86: Remove possible deadlock between register_kprobe() and ftrace_run_update_code() The commit 9f255b632bf12c4dd7 ("module: Fix livepatch/ftrace module text permissions race") causes a possible deadlock between register_kprobe() and ftrace_run_update_code() when ftrace is using stop_machine(). The existing dependency chain (in reverse order) is: -> #1 (text_mutex){+.+.}: validate_chain.isra.21+0xb32/0xd70 __lock_acquire+0x4b8/0x928 lock_acquire+0x102/0x230 __mutex_lock+0x88/0x908 mutex_lock_nested+0x32/0x40 register_kprobe+0x254/0x658 init_kprobes+0x11a/0x168 do_one_initcall+0x70/0x318 kernel_init_freeable+0x456/0x508 kernel_init+0x22/0x150 ret_from_fork+0x30/0x34 kernel_thread_starter+0x0/0xc -> #0 (cpu_hotplug_lock.rw_sem){++++}: check_prev_add+0x90c/0xde0 validate_chain.isra.21+0xb32/0xd70 __lock_acquire+0x4b8/0x928 lock_acquire+0x102/0x230 cpus_read_lock+0x62/0xd0 stop_machine+0x2e/0x60 arch_ftrace_update_code+0x2e/0x40 ftrace_run_update_code+0x40/0xa0 ftrace_startup+0xb2/0x168 register_ftrace_function+0x64/0x88 klp_patch_object+0x1a2/0x290 klp_enable_patch+0x554/0x980 do_one_initcall+0x70/0x318 do_init_module+0x6e/0x250 load_module+0x1782/0x1990 __s390x_sys_finit_module+0xaa/0xf0 system_call+0xd8/0x2d0 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(text_mutex); lock(cpu_hotplug_lock.rw_sem); lock(text_mutex); lock(cpu_hotplug_lock.rw_sem); It is similar problem that has been solved by the commit 2d1e38f56622b9b ("kprobes: Cure hotplug lock ordering issues"). Many locks are involved. To be on the safe side, text_mutex must become a low level lock taken after cpu_hotplug_lock.rw_sem. This can't be achieved easily with the current ftrace design. For example, arm calls set_all_modules_text_rw() already in ftrace_arch_code_modify_prepare(), see arch/arm/kernel/ftrace.c. This functions is called: + outside stop_machine() from ftrace_run_update_code() + without stop_machine() from ftrace_module_enable() Fortunately, the problematic fix is needed only on x86_64. It is the only architecture that calls set_all_modules_text_rw() in ftrace path and supports livepatching at the same time. 
Therefore it is enough to move text_mutex handling from the generic kernel/trace/ftrace.c into arch/x86/kernel/ftrace.c: ftrace_arch_code_modify_prepare() ftrace_arch_code_modify_post_process() This patch basically reverts the ftrace part of the problematic commit 9f255b632bf12c4dd7 ("module: Fix livepatch/ftrace module text permissions race"). And provides x86_64 specific-fix. Some refactoring of the ftrace code will be needed when livepatching is implemented for arm or nds32. These architectures call set_all_modules_text_rw() and use stop_machine() at the same time. Link: http://lkml.kernel.org/r/20190627081334.12793-1-pmladek@suse.com Fixes: 9f255b632bf12c4dd7 ("module: Fix livepatch/ftrace module text permissions race") Acked-by: Thomas Gleixner Reported-by: Miroslav Benes Reviewed-by: Miroslav Benes Reviewed-by: Josh Poimboeuf Signed-off-by: Petr Mladek [ As reviewed by Miroslav Benes , removed return value of ftrace_run_update_code() as it is a void function. ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38277af44f5c..576c41644e77 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,7 +34,6 @@ #include #include #include -#include #include @@ -2611,12 +2610,10 @@ static void ftrace_run_update_code(int command) { int ret; - mutex_lock(&text_mutex); - ret = ftrace_arch_code_modify_prepare(); FTRACE_WARN_ON(ret); if (ret) - goto out_unlock; + return; /* * By default we use stop_machine() to modify the code. @@ -2628,9 +2625,6 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); - -out_unlock: - mutex_unlock(&text_mutex); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -5784,7 +5778,6 @@ void ftrace_module_enable(struct module *mod) struct ftrace_page *pg; mutex_lock(&ftrace_lock); - mutex_lock(&text_mutex); if (ftrace_disabled) goto out_unlock; @@ -5846,7 +5839,6 @@ void ftrace_module_enable(struct module *mod) ftrace_arch_code_modify_post_process(); out_unlock: - mutex_unlock(&text_mutex); mutex_unlock(&ftrace_lock); process_cached_mods(mod->name); -- cgit v1.2.3 From d122ed6288d9821b405b0f84a3937955b9df545f Mon Sep 17 00:00:00 2001 From: Takeshi Misawa Date: Fri, 28 Jun 2019 19:56:40 +0900 Subject: tracing: Fix memory leak in tracing_err_log_open() When tracing_err_log_open() calls seq_open(), allocated memory is not freed. kmemleak report: unreferenced object 0xffff92c0781d1100 (size 128): comm "tail", pid 15116, jiffies 4295163855 (age 22.704s) hex dump (first 32 bytes): 00 f0 08 e5 c0 92 ff ff 00 10 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000000d0687d5>] kmem_cache_alloc+0x11f/0x1e0 [<000000003e3039a8>] seq_open+0x2f/0x90 [<000000008dd36b7d>] tracing_err_log_open+0x67/0x140 [<000000005a431ae2>] do_dentry_open+0x1df/0x3a0 [<00000000a2910603>] vfs_open+0x2f/0x40 [<0000000038b0a383>] path_openat+0x2e8/0x1690 [<00000000fe025bda>] do_filp_open+0x9b/0x110 [<00000000483a5091>] do_sys_open+0x1ba/0x260 [<00000000c558b5fd>] __x64_sys_openat+0x20/0x30 [<000000006881ec07>] do_syscall_64+0x5a/0x130 [<00000000571c2e94>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by calling seq_release() in tracing_err_log_fops.release(). 
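The convention at work: whatever .open allocates through seq_open() must be torn down in .release, and any reference taken in .open must be dropped there as well. A minimal sketch of the pairing for a plain seq_file with no extra state (hypothetical example_* names, assuming no other references are held):

    static const struct file_operations example_seq_fops = {
            .open    = example_seq_open,    /* calls seq_open() */
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = seq_release,         /* frees what seq_open() made */
    };

tracing_err_log needs the dedicated release shown below instead, because its open path also takes a trace_array reference and the file is reachable write-only, where no seq_file exists.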
Link: http://lkml.kernel.org/r/20190628105640.GA1863@DESKTOP Fixes: 8a062902be725 ("tracing: Add tracing error log") Reviewed-by: Tom Zanussi Signed-off-by: Takeshi Misawa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 83e08b78dbee..4122ccde6ec2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7126,12 +7126,24 @@ static ssize_t tracing_err_log_write(struct file *file, return count; } +static int tracing_err_log_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, .llseek = seq_lseek, - .release = tracing_release_generic_tr, + .release = tracing_err_log_release, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) -- cgit v1.2.3 From 46cc0b44428d0f0e81f11ea98217fc0edfbeab07 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Tue, 25 Jun 2019 10:29:10 +0900 Subject: tracing/snapshot: Resize spare buffer if size changed Current snapshot implementation swaps two ring_buffers even though their sizes are different from each other, that can cause an inconsistency between the contents of buffer_size_kb file and the current buffer size. For example: # cat buffer_size_kb 7 (expanded: 1408) # echo 1 > events/enable # grep bytes per_cpu/cpu0/stats bytes: 1441020 # echo 1 > snapshot // current:1408, spare:1408 # echo 123 > buffer_size_kb // current:123, spare:1408 # echo 1 > snapshot // current:1408, spare:123 # grep bytes per_cpu/cpu0/stats bytes: 1443700 # cat buffer_size_kb 123 // != current:1408 And also, a similar per-cpu case hits the following WARNING: Reproducer: # echo 1 > per_cpu/cpu0/snapshot # echo 123 > buffer_size_kb # echo 1 > per_cpu/cpu0/snapshot WARNING: WARNING: CPU: 0 PID: 1946 at kernel/trace/trace.c:1607 update_max_tr_single.part.0+0x2b8/0x380 Modules linked in: CPU: 0 PID: 1946 Comm: bash Not tainted 5.2.0-rc6 #20 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 RIP: 0010:update_max_tr_single.part.0+0x2b8/0x380 Code: ff e8 dc da f9 ff 0f 0b e9 88 fe ff ff e8 d0 da f9 ff 44 89 ee bf f5 ff ff ff e8 33 dc f9 ff 41 83 fd f5 74 96 e8 b8 da f9 ff <0f> 0b eb 8d e8 af da f9 ff 0f 0b e9 bf fd ff ff e8 a3 da f9 ff 48 RSP: 0018:ffff888063e4fca0 EFLAGS: 00010093 RAX: ffff888066214380 RBX: ffffffff99850fe0 RCX: ffffffff964298a8 RDX: 0000000000000000 RSI: 00000000fffffff5 RDI: 0000000000000005 RBP: 1ffff1100c7c9f96 R08: ffff888066214380 R09: ffffed100c7c9f9b R10: ffffed100c7c9f9a R11: 0000000000000003 R12: 0000000000000000 R13: 00000000ffffffea R14: ffff888066214380 R15: ffffffff99851060 FS: 00007f9f8173c700(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000714dc0 CR3: 0000000066fa6000 CR4: 00000000000006f0 Call Trace: ? trace_array_printk_buf+0x140/0x140 ? __mutex_lock_slowpath+0x10/0x10 tracing_snapshot_write+0x4c8/0x7f0 ? trace_printk_init_buffers+0x60/0x60 ? selinux_file_permission+0x3b/0x540 ? tracer_preempt_off+0x38/0x506 ? trace_printk_init_buffers+0x60/0x60 __vfs_write+0x81/0x100 vfs_write+0x1e1/0x560 ksys_write+0x126/0x250 ? __ia32_sys_read+0xb0/0xb0 ? 
do_syscall_64+0x1f/0x390 do_syscall_64+0xc1/0x390 entry_SYSCALL_64_after_hwframe+0x49/0xbe This patch adds resize_buffer_duplicate_size() to check if there is a difference between current/spare buffer sizes and resize a spare buffer if necessary. Link: http://lkml.kernel.org/r/20190625012910.13109-1-devel@etsukata.com Cc: stable@vger.kernel.org Fixes: ad909e21bbe69 ("tracing: Add internal tracing_snapshot() functions") Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4122ccde6ec2..c3aabb576fe5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6719,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; } #endif - if (!tr->allocated_snapshot) { + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, iter->cpu_file); + else ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - break; - } + if (ret < 0) + break; local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) -- cgit v1.2.3 From 97abc889ee296faf95ca0e978340fb7b942a3e32 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 28 Jun 2019 12:06:50 -0700 Subject: signal: remove the wrong signal_pending() check in restore_user_sigmask() This is the minimal fix for stable, I'll send cleanups later. Commit 854a6ed56839 ("signal: Add restore_user_sigmask()") introduced the visible change which breaks user-space: a signal temporary unblocked by set_user_sigmask() can be delivered even if the caller returns success or timeout. Change restore_user_sigmask() to accept the additional "interrupted" argument which should be used instead of signal_pending() check, and update the callers. Eric said: : For clarity. I don't think this is required by posix, or fundamentally to : remove the races in select. It is what linux has always done and we have : applications who care so I agree this fix is needed. : : Further in any case where the semantic change that this patch rolls back : (aka where allowing a signal to be delivered and the select like call to : complete) would be advantage we can do as well if not better by using : signalfd. : : Michael is there any chance we can get this guarantee of the linux : implementation of pselect and friends clearly documented. The guarantee : that if the system call completes successfully we are guaranteed that no : signal that is unblocked by using sigmask will be delivered? Link: http://lkml.kernel.org/r/20190604134117.GA29963@redhat.com Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()") Signed-off-by: Oleg Nesterov Reported-by: Eric Wong Tested-by: Eric Wong Acked-by: "Eric W. 
Biederman" Acked-by: Arnd Bergmann Acked-by: Deepa Dinamani Cc: Michael Kerrisk Cc: Jens Axboe Cc: Davidlohr Bueso Cc: Jason Baron Cc: Thomas Gleixner Cc: Al Viro Cc: David Laight Cc: [5.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d622eac9d169..edf8915ddd54 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2912,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask); * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed in from userland for the syscalls. */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, + bool interrupted) { if (!usigmask) @@ -2922,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) * Restoring sigmask here can lead to delivering signals that the above * syscalls are intended to block because of the sigmask passed in. */ - if (signal_pending(current)) { + if (interrupted) { current->saved_sigmask = *sigsaved; set_restore_sigmask(); return; -- cgit v1.2.3 From 1bf4580e00a248a2c86269125390eb3648e1877c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 28 Jun 2019 12:07:14 -0700 Subject: fork,memcg: alloc_thread_stack_node needs to set tsk->stack Commit 5eed6f1dff87 ("fork,memcg: fix crash in free_thread_stack on memcg charge fail") corrected two instances, but there was a third instance of this bug. Without setting tsk->stack, if memcg_charge_kernel_stack fails, it'll execute free_thread_stack() on a dangling pointer. Enterprise kernels are compiled with VMAP_STACK=y so this isn't critical, but custom VMAP_STACK=n builds should have some performance advantage, with the drawback of risking to fail fork because compaction didn't succeed. So as long as VMAP_STACK=n is a supported option it's worth fixing it upstream. Link: http://lkml.kernel.org/r/20190619011450.28048-1-aarcange@redhat.com Fixes: 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 399aca51ff75..61667909ce83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -248,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - return page ? page_address(page) : NULL; + if (likely(page)) { + tsk->stack = page_address(page); + return tsk->stack; + } + return NULL; #endif } -- cgit v1.2.3 From 28dd29c06d0dede4b32b2c559cff21955a830928 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Jul 2019 16:01:46 +0200 Subject: fork: return proper negative error code Make sure to return a proper negative error code from copy_process() when anon_inode_getfile() fails with CLONE_PIDFD. 
Otherwise _do_fork() will not detect an error and get_task_pid() will operator on a nonsensical pointer: R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c R13: 00007ffc15fbb0ff R14: 00007ff07e47e9c0 R15: 0000000000000000 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 7990 Comm: syz-executor290 Not tainted 5.2.0-rc6+ #9 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:get_task_pid+0xe1/0x210 kernel/pid.c:372 Code: 89 ff e8 62 27 5f 00 49 8b 07 44 89 f1 4c 8d bc c8 90 01 00 00 eb 0c e8 0d fe 25 00 49 81 c7 38 05 00 00 4c 89 f8 48 c1 e8 03 <80> 3c 18 00 74 08 4c 89 ff e8 31 27 5f 00 4d 8b 37 e8 f9 47 12 00 RSP: 0018:ffff88808a4a7d78 EFLAGS: 00010203 RAX: 00000000000000a7 RBX: dffffc0000000000 RCX: ffff888088180600 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff88808a4a7d90 R08: ffffffff814fb3a8 R09: ffffed1015d66bf8 R10: ffffed1015d66bf8 R11: 1ffff11015d66bf7 R12: 0000000000041ffc R13: 1ffff11011494fbc R14: 0000000000000000 R15: 000000000000053d FS: 00007ff07e47e700(0000) GS:ffff8880aeb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004b5100 CR3: 0000000094df2000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: _do_fork+0x1b9/0x5f0 kernel/fork.c:2360 __do_sys_clone kernel/fork.c:2454 [inline] __se_sys_clone kernel/fork.c:2448 [inline] __x64_sys_clone+0xc1/0xd0 kernel/fork.c:2448 do_syscall_64+0xfe/0x140 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: https://lore.kernel.org/lkml/000000000000e0dc0d058c9e7142@google.com Reported-and-tested-by: syzbot+002e636502bc4b64eb5c@syzkaller.appspotmail.com Fixes: 6fd2fe494b17 ("copy_process(): don't use ksys_close() on cleanups") Cc: Jann Horn Cc: Al Viro Signed-off-by: Christian Brauner --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 61667909ce83..fe83343da24b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2036,6 +2036,7 @@ static __latent_entropy struct task_struct *copy_process( O_RDWR | O_CLOEXEC); if (IS_ERR(pidfile)) { put_unused_fd(pidfd); + retval = PTR_ERR(pidfile); goto bad_fork_free_pid; } get_pid(pid); /* held by pidfile now */ -- cgit v1.2.3 From 6994eefb0053799d2e07cd140df6c2ea106c41ee Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Jul 2019 17:32:23 +0200 Subject: ptrace: Fix ->ptracer_cred handling for PTRACE_TRACEME Fix two issues: When called for PTRACE_TRACEME, ptrace_link() would obtain an RCU reference to the parent's objective credentials, then give that pointer to get_cred(). However, the object lifetime rules for things like struct cred do not permit unconditionally turning an RCU reference into a stable reference. PTRACE_TRACEME records the parent's credentials as if the parent was acting as the subject, but that's not the case. 
If a malicious unprivileged child uses PTRACE_TRACEME and the parent is privileged, and at a later point, the parent process becomes attacker-controlled (because it drops privileges and calls execve()), the attacker ends up with control over two processes with a privileged ptrace relationship, which can be abused to ptrace a suid binary and obtain root privileges. Fix both of these by always recording the credentials of the process that is requesting the creation of the ptrace relationship: current_cred() can't change under us, and current is the proper subject for access control. This change is theoretically userspace-visible, but I am not aware of any code that it will actually break. Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: Jann Horn Acked-by: Oleg Nesterov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8456b6e2205f..705887f63288 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -79,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, */ static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - rcu_read_lock(); - __ptrace_link(child, new_parent, __task_cred(new_parent)); - rcu_read_unlock(); + __ptrace_link(child, new_parent, current_cred()); } /** -- cgit v1.2.3
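Taken together, the change swaps both the locking model and the recorded subject; condensed from the diff above:

    /* before: parent's *objective* creds, grabbed under RCU - neither
     * safe to get_cred() unconditionally nor the right subject for
     * PTRACE_TRACEME */
    __ptrace_link(child, new_parent, __task_cred(new_parent));

    /* after: creds of whoever requested the link - the tracer for
     * PTRACE_ATTACH, the child itself for PTRACE_TRACEME */
    __ptrace_link(child, new_parent, current_cred());

Since current_cred() cannot change underneath the caller, no RCU read-side section is needed around the call anymore.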