From a2775bbc1d58ce630517dfe86090c166f27d719f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 12 Mar 2019 21:21:26 +0100 Subject: kernel/workqueue: Use __printf markup to silence compiler in function 'alloc_workqueue' Silence warnings (triggered at W=1) by adding the relevant __printf attribute. kernel/workqueue.c:4249:2: warning: function 'alloc_workqueue' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format] Signed-off-by: Mathieu Malaterre Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..56b7cf898f10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4208,6 +4208,7 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } +__printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) -- cgit v1.2.3 From 95e0b46fcebd7dbf6850dee96046e4c4ddc7f69c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 7 Mar 2019 09:16:24 +0800 Subject: audit: fix a memleak caused by auditing load module module.name is allocated unconditionally when auditing a module load, but audit_log_start() can fail, and audit_log_exit() may never be called, in which case module.name is never freed. Fix this by freeing module.name in audit_free_context() and __audit_syscall_exit(). unreferenced object 0xffff88af90837d20 (size 8): comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s) hex dump (first 8 bytes): 69 78 67 62 65 00 ff ff ixgbe... backtrace: [<0000000008da28fe>] __audit_log_kern_module+0x33/0x80 [<00000000c1491e61>] load_module+0x64f/0x3850 [<000000007fc9ae3f>] __do_sys_init_module+0x218/0x250 [<0000000000d4a478>] do_syscall_64+0x117/0x400 [<000000004924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000007dc331dd>] 0xffffffffffffffff Fixes: ca86cad7380e3 ("audit: log module name on init_module") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing [PM: manual merge fixup in __audit_syscall_exit()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..fa7b8047aab8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -840,6 +840,13 @@ static inline void audit_proctitle_free(struct audit_context *context) context->proctitle.len = 0; } +static inline void audit_free_module(struct audit_context *context) +{ + if (context->type == AUDIT_KERN_MODULE) { + kfree(context->module.name); + context->module.name = NULL; + } +} static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -923,6 +930,7 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); free_tree_refs(context); @@ -1266,7 +1274,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); } else audit_log_format(ab, "(null)"); @@ -1697,6 +1704,7 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ?
~0ULL : 0; + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); -- cgit v1.2.3 From 8194fe94ab08f56ea55653df924647d23873d18c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Mar 2019 10:45:09 -0700 Subject: kernel/workqueue: Document wq_worker_last_func() argument This patch avoids the following warning, reported when building with W=1: kernel/workqueue.c:938: warning: Function parameter or member 'task' not described in 'wq_worker_last_func' Signed-off-by: Bart Van Assche Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 56b7cf898f10..21721faa923c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -913,6 +913,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) /** * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. -- cgit v1.2.3 From 73e65b88feb919f95bdb77c4ed35f69588cf27ee Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 19 Mar 2019 15:23:29 -0400 Subject: audit: connect LOGIN record to its syscall record Currently the AUDIT_LOGIN event is a standalone record that isn't connected to any other records that may be part of its syscall event. To avoid the confusion of generating two events, connect the records by using the task's syscall audit context. Please see the github issue https://github.com/linux-audit/audit-kernel/issues/110 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c89ea48c70a6..b96bf69183f4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2220,7 +2220,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; -- cgit v1.2.3 From 2efa48fec0c344a6ca1bba66b15d63d38cf20199 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:59:22 +0800 Subject: audit: Make audit_log_cap and audit_copy_inode static Fix the following sparse warnings: kernel/auditsc.c:1150:6: warning: symbol 'audit_log_cap' was not declared. Should it be static? kernel/auditsc.c:1908:6: warning: symbol 'audit_copy_inode' was not declared. Should it be static? Signed-off-by: YueHaibing Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fa7b8047aab8..17b0007fafc2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1147,7 +1147,8 @@ out: kfree(buf_head); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +static void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap) { int i; @@ -1905,8 +1906,9 @@ static inline int audit_copy_fcaps(struct audit_names *name, } /* Copy inode data into an audit_names.
*/ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode, unsigned int flags) +static void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; -- cgit v1.2.3 From 16add411645cff83360086e102daa67b25f1e39a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:18 +0300 Subject: syscall_get_arch: add "struct task_struct *" argument This argument is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going to be called from ptrace_request() along with syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions with a tracee as their argument. The primary intent is that the triple (audit_arch, syscall_nr, arg1..arg6) should describe what system call is being called and what its arguments are. Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments") Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()") Reviewed-by: Andy Lutomirski # for x86 Reviewed-by: Palmer Dabbelt Acked-by: Paul Moore Acked-by: Paul Burton # MIPS parts Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook # seccomp parts Acked-by: Mark Salter # for the c6x bit Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: x86@kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. 
Levin Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- kernel/seccomp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17b0007fafc2..98a98e6dca05 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1636,7 +1636,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -2590,7 +2590,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..36f36ab00f48 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, 0, 6, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -591,7 +591,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } -- cgit v1.2.3 From 0f3adc288df8ba2ac2fea0a8e4890f9fb4cd075d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:53:59 +0800 Subject: bpf: track references based on is_acquire_func So far, the verifier only acquires reference tracking state for RET_PTR_TO_SOCKET_OR_NULL. Instead of extending this for every new return type which desires these semantics, acquire reference tracking state iff the called helper is an acquire function. 
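To make the acquire/release contract concrete, here is a minimal program-side sketch of what the verifier's reference tracking enforces (illustrative only; the includes, section name and empty tuple are assumptions, not part of this patch):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("classifier")
int ref_track_demo(struct __sk_buff *skb)
{
	/* A real program would fill the tuple from packet data. */
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	/* bpf_sk_lookup_tcp() is an acquire function... */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (sk)
		/* ...so the reference must be released on every path,
		 * otherwise the verifier rejects the program with an
		 * unreleased-reference error. */
		bpf_sk_release(sk);
	return 0;
}

char _license[] SEC("license") = "GPL";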
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86f9cd5d1c4e..868a82ad5597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3147,19 +3147,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3170,9 +3158,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); -- cgit v1.2.3 From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 868a82ad5597..a476e13201d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3148,6 +3148,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; -- cgit v1.2.3 From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. 
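As a similarly hedged sketch of what the new return type enables (substituting bpf_skc_lookup_tcp() into the example above; tuple setup again omitted):

	sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (sk) {
		/* sk may now reference a full, timewait or request
		 * socket; it must still be released like any other
		 * acquired reference. */
		bpf_sk_release(sk);
	}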
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476e13201d6..dffeec3706ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) -- cgit v1.2.3 From d7dcf26ff0ffd7b56fe2b09ed7f1867589f3cdf1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2019 23:48:21 +0100 Subject: softirq: Remove tasklet_hrtimer There are no more users of this interface. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: David S. Miller Cc: netdev@vger.kernel.org Link: https://lkml.kernel.org/r/20190301224821.29843-4-bigeasy@linutronix.de --- kernel/softirq.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 10277429ed84..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t) } EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. - */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ - struct tasklet_hrtimer *ttimer = - container_of(timer, struct tasklet_hrtimer, timer); - - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ - struct tasklet_hrtimer *ttimer = (void *)data; - enum hrtimer_restart restart; - - restart = ttimer->function(&ttimer->timer); - if (restart != HRTIMER_NORESTART) - hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode) -{ - hrtimer_init(&ttimer->timer, which_clock, mode); - ttimer->timer.function = __hrtimer_tasklet_trampoline; - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, - (unsigned long)ttimer); - ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); - void __init softirq_init(void) { int cpu; -- cgit v1.2.3 From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. 
The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..1b942a7caf26 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -650,16 +650,37 @@ static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, + /* policy enforced later */ + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, + /* policy enforced later */ + .flags = GENL_CMD_CAP_HASPOL, }, }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + const struct nla_policy *policy = NULL; + + switch (ops->cmd) { + case TASKSTATS_CMD_GET: + policy = taskstats_cmd_get_policy; + break; + case CGROUPSTATS_CMD_GET: + policy = cgroupstats_cmd_get_policy; + break; + default: + return -EINVAL; + } + + return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, + policy, info->extack); +} + static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, @@ -667,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ -- cgit v1.2.3 From e1e41b6ce5f9c1a80bf4f2404ec5ab11c6c5a2ad Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 18 Mar 2019 20:55:56 +0100 Subject: timekeeping: Consistently use unsigned int for seqcount snapshot The timekeeping code uses a random mix of "unsigned long" and "unsigned int" for the seqcount snapshots (ratio 14:12). Since the seqlock.h API is entirely based on unsigned int, use that throughout. 
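For reference, the read-side pattern in question, as a minimal sketch using the unsigned int type the seqcount API in seqlock.h deals in (the sampled field is illustrative):

	unsigned int seq;
	u64 secs;

	do {
		seq = read_seqcount_begin(&tk_core.seq);
		secs = tk_core.timekeeper.xtime_sec;	/* any snapshot read */
	} while (read_seqcount_retry(&tk_core.seq, seq));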
Signed-off-by: Rasmus Villemoes Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: John Stultz Cc: Stephen Boyd Link: https://lkml.kernel.org/r/20190318195557.20773-1-linux@rasmusvillemoes.dk --- kernel/time/jiffies.c | 2 +- kernel/time/sched_clock.c | 4 ++-- kernel/time/tick-common.c | 2 +- kernel/time/tick-sched.c | 3 ++- kernel/time/timekeeping.c | 18 +++++++++--------- 5 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..95f8f3304c19 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { - unsigned long seq; + unsigned int seq; u64 ret; do { diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..16b80c2b4fe8 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { u64 cyc, res; - unsigned long seq; + unsigned int seq; struct clock_read_data *rd; do { @@ -267,7 +267,7 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned long seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..561641b2153f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -149,7 +149,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { - unsigned long seq; + unsigned int seq; ktime_t next; do { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..b50f6f22c88e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -645,7 +645,8 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; - unsigned long seq, basejiff; + unsigned long basejiff; + unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..540145da33da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; - unsigned long seq; + unsigned int seq; ktime_t tconv; do { @@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void) void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; ktime_t base_raw; ktime_t base_real; u64 nsec_raw; @@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn) ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; - unsigned long seq; + unsigned int seq; bool do_interp; int ret; @@ -1409,7 +1409,7 @@ 
int timekeeping_notify(struct clocksource *clock) void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; do { @@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64); int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; int ret; do { @@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void) u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); -- cgit v1.2.3 From 1b72d43237980eab9b6ae6bb8181e51c840377e6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Mar 2019 16:39:20 +0100 Subject: tick: Remove outgoing CPU from broadcast masks Valentin reported that unplugging a CPU occasionally results in a warning in the tick broadcast code which is triggered when an offline CPU is in the broadcast mask. This happens because the outgoing CPU is not removing itself from the broadcast masks, especially not from the broadcast_force_mask. The removal happens on the control CPU after the outgoing CPU is dead. It's a long standing issue, but the warning is harmless. Rework the hotplug mechanism so that the outgoing CPU removes itself from the broadcast masks after disabling interrupts and removing itself from the online mask. Reported-by: Valentin Schneider Signed-off-by: Thomas Gleixner Tested-by: Valentin Schneider Cc: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903211540180.1784@nanos.tec.linutronix.de --- kernel/cpu.c | 2 ++ kernel/time/clockevents.c | 18 ++++++++++++++++-- kernel/time/tick-broadcast.c | 40 +++++++++++++++++++--------------------- kernel/time/tick-internal.h | 10 ++++++---- 4 files changed, 43 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 025f419d16f6..f69ba38573c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -844,6 +844,8 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); + /* Remove CPU from timer broadcasting */ + tick_offline_cpu(cpu); /* Park the stopper thread */ stop_machine_park(cpu); return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void) } #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu: The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. 
+ */ +void tick_offline_cpu(unsigned int cpu) +{ + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); + raw_spin_unlock(&clockevents_lock); +} +# endif + /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu) raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_shutdown_broadcast_oneshot(cpu); - tick_shutdown_broadcast(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee834d4fb814..0283523de045 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,10 +36,12 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +static void tick_broadcast_oneshot_offline(unsigned int cpu); #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } #endif /* @@ -433,27 +435,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) } #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void) { - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_broadcast_mask); - cpumask_clear_cpu(cpu, tick_broadcast_on); + struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } +} - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ + raw_spin_lock(&tick_broadcast_lock); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + tick_broadcast_oneshot_offline(cpu); + tick_shutdown_broadcast(); + raw_spin_unlock(&tick_broadcast_lock); } + #endif void tick_suspend_broadcast(void) @@ -950,14 +954,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu) { - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! 
@@ -965,8 +965,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } static inline bool tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); -- cgit v1.2.3 From d6b87eaf10bd061914f6d277d7428b3285d8850e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:18 +0100 Subject: tick/sched: Update tick_sched struct documentation Adapt the documentation order of struct members to the effective order of struct members and add missing descriptions. 
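The kernel-doc convention being restored here documents members in declaration order; a minimal, hypothetical example of the format:

/**
 * struct foo - one-line description of the structure
 * @first: members are described in the same order as they are declared
 * @second: a member left undescribed triggers a W=1 kernel-doc warning
 */
struct foo {
	int first;
	int second;
};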
Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/20190321120921.16463-2-anna-maria@linutronix.de --- kernel/time/tick-sched.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode + * @check_clocks: Notification mechanism about clocksource changes + * @nohz_mode: Mode - one state of tick_nohz_mode + * @inidle: Indicator that the CPU is in the tick idle mode + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_active: Indicator that the CPU is actively in the tick idle mode; + * it is resetted during irq handling phases. + * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. - * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode { * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires - * @do_timer_lst: CPU was the last one doing do_timer before going idle - * @got_idle_tick: Tick timer function has run with @inidle set + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick */ struct tick_sched { struct hrtimer sched_timer; -- cgit v1.2.3 From dc1e7dc5ac6254ba0502323381a7ec847e408f1d Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:19 +0100 Subject: timer: Move trace point to get proper index When placing the timer_start trace point before the timer wheel bucket index is calculated, the index information in the trace point is useless. It is not possible to simply move the debug_activate() call after the index calculation, because debug_object_activate() needs to be called before touching the object. Therefore split debug_activate() and move the trace point into enqueue_timer() after the new index has been calculated. The debug_object_activate() call remains at the original place. 
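A simplified sketch of the resulting ordering (calc_wheel_index() is the index helper in kernel/time/timer.c of this era; locking and error handling omitted):

	debug_timer_activate(timer);	/* debug_objects hook: before the timer is touched */
	idx = calc_wheel_index(expires, base->clk);
	enqueue_timer(base, timer, idx);	/* trace_timer_start() fires in here, once idx is known */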
Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Steven Rostedt Link: https://lkml.kernel.org/r/20190321120921.16463-3-anna-maria@linutronix.de --- kernel/time/timer.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..8d7918ae4d0c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); } static void @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer) trace_timer_init(timer); } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires, timer->flags); -} - static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } - debug_activate(timer, expires); + debug_timer_activate(timer); timer->expires = expires; /* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu) } forward_timer_base(base); - debug_activate(timer, timer->expires); + debug_timer_activate(timer); internal_add_timer(base, timer); raw_spin_unlock_irqrestore(&base->lock, flags); } -- cgit v1.2.3 From f28d3d5346e97e60c81f933ac89ccf015430e5cf Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Mar 2019 13:09:21 +0100 Subject: timer/trace: Improve timer tracing Timers are added to the timer wheel off by one. This is required in case a timer is queued directly before incrementing jiffies to prevent early timer expiry. When reading a timer trace and relying only on the expiry time of the timer in the timer_start trace point and on the now in the timer_expiry_entry trace point, it seems that the timer fires late. With the current timer_expiry_entry trace point information only now=jiffies is printed but not the value of base->clk. This makes it impossible to draw a conclusion to the index of base->clk and makes it impossible to examine timer problems without additional trace points. Therefore add the base->clk value to the timer_expire_entry trace point, to be able to calculate the index the timer base is located at during collecting expired timers. 
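The "off by one" comes from the wheel's index calculation, which rounds an expiry up to the next bucket of a level's granularity so that a timer can never fire early; a sketch modeled on the calc_index() helper in kernel/time/timer.c of this era:

static inline unsigned calc_index(unsigned expires, unsigned lvl)
{
	/* Round the expiry up to this level's granularity... */
	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
	/* ...and map it into this level's slice of the bucket array. */
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}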
Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Steven Rostedt Link: https://lkml.kernel.org/r/20190321120921.16463-5-anna-maria@linutronix.de --- kernel/time/timer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8d7918ae4d0c..a9b1bbc2d88d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1293,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, + void (*fn)(struct timer_list *), + unsigned long baseclk) { int count = preempt_count(); @@ -1316,7 +1318,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list */ lock_map_acquire(&lockdep_map); - trace_timer_expire_entry(timer); + trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); @@ -1337,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list static void expire_timers(struct timer_base *base, struct hlist_head *head) { + /* + * This value is required only for tracing. base->clk was + * incremented directly before expire_timers was called. But expiry + * is related to the old base->clk value. + */ + unsigned long baseclk = base->clk - 1; + while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); @@ -1350,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); } } -- cgit v1.2.3 From 59c39840f5abf4a71e1810a8da71aaccd6c17d26 Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Sun, 24 Mar 2019 07:57:04 -0700 Subject: genirq: Prevent use-after-free and work list corruption When irq_set_affinity_notifier() replaces the notifier, the reference count on the old notifier is dropped, which causes it to be freed. But nothing ensures that the old notifier is no longer queued in the work list. If it is still queued, this results in a use-after-free and possibly in work list corruption. Ensure that the work is canceled before the reference is dropped. Signed-off-by: Prasad Sodagudi Signed-off-by: Thomas Gleixner Cc: marc.zyngier@arm.com Link: https://lkml.kernel.org/r/1553439424-6529-1-git-send-email-psodagud@codeaurora.org --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1401afa0d58a..53a081392115 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); - if (old_notify) + if (old_notify) { + cancel_work_sync(&old_notify->work); kref_put(&old_notify->kref, old_notify->release); + } return 0; } -- cgit v1.2.3 From e85e6a21b2b5f31148cc3f2e785262b37c3e1ec7 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Thu, 10 Jan 2019 15:30:15 -0800 Subject: rcu: Unconditionally expedite during suspend/hibernate The rcu_pm_notify() function refuses to switch to/from expedited grace periods on systems with more than 256 CPUs due to the serialized initialization of expedited grace periods. However, expedited grace periods are now initialized in parallel, removing this concern. This commit therefore removes the checks from rcu_pm_notify(), so that expedited grace periods are used unconditionally during suspend/resume and hibernate/wake operations. As always, real-time workloads wishing to completely avoid expedited grace periods can use the rcupdate.rcu_normal= kernel parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..95e3250b7b6e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3559,13 +3559,11 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedite_gp(); + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); break; default: break; -- cgit v1.2.3 From 671a63517cf983ad8eaa324167165cef245ab744 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 19 Jan 2019 11:14:18 -0500 Subject: rcu: Avoid unnecessary softirq when system is idle When there are no callbacks pending on an idle system, I noticed that RCU softirq is continuously firing. During this, cpu_no_qs is set to false, and core_needs_qs is set to true indefinitely. This causes rcu_process_callbacks to be repeatedly called, even though the node corresponding to the CPU has that CPU's mask bit cleared and the system is idle. I believe the race is when such mask clearing is done during idle CPU scan of the quiescent state forcing stage in the kthread instead of the softirq. Since the rnp mask is cleared, but the flags on the CPU's rdp are not cleared, the CPU thinks it still needs to report to core RCU. Cure this by clearing the core_needs_qs flag when the CPU detects that its node is already updated, which avoids the unwanted softirq raises, to the benefit of real-time systems. Test: Ran rcutorture for various tree RCU configs. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 95e3250b7b6e..2f78a115d34c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2296,6 +2296,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { + rdp->core_needs_qs = false; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { rdp->core_needs_qs = false; -- cgit v1.2.3 From 18d7e40679ef574e428f893101be1c0035e95ee3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 24 Jan 2019 21:14:37 +0300 Subject: rcu: rcu_qs -- Use raise_softirq_irqoff to not save irqs twice rcu_qs() disables IRQs itself, so there is no need for raise_softirq() to do the same; instead, we can save some cycles by using raise_softirq_irqoff() directly. CC: Paul E. McKenney Signed-off-by: Cyrill Gorcunov Signed-off-by: Paul E.
McKenney --- kernel/rcu/tiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 911bd9076d43..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -52,7 +52,7 @@ void rcu_qs(void) local_irq_save(flags); if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; - raise_softirq(RCU_SOFTIRQ); + raise_softirq_irqoff(RCU_SOFTIRQ); } local_irq_restore(flags); } -- cgit v1.2.3 From 884157cef0acf05648fe921d80c680afababb428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Feb 2019 07:21:29 -0800 Subject: rcu: Make exit_rcu() handle non-preempted RCU readers The purpose of exit_rcu() is to handle cases where buggy code causes a task to exit within an RCU read-side critical section. It currently does that in the case where said RCU read-side critical section was preempted at least once, but fails to handle cases where preemption did not occur. This case needs to be handled because otherwise the final context switch away from the exiting task will incorrectly behave as if task exit were instead a preemption of an RCU read-side critical section, and will therefore queue the exiting task. The exiting task will have exited, and thus won't ever execute rcu_read_unlock(), which means that it will remain queued forever, blocking all subsequent grace periods, and eventually resulting in OOM. Although this is arguably better than letting grace periods proceed and having a later rcu_read_unlock() access the now-freed task structure that once belonged to the exiting tasks, it would obviously be better to correctly handle this case. This commit therefore sets ->rcu_read_lock_nesting to 1 in that case, so that the subsequence call to __rcu_read_unlock() causes the exiting task to exit its dangling RCU read-side critical section. Note that deferred quiescent states need not be considered. The reason is that removing the task from the ->blkd_tasks[] list in the call to rcu_preempt_deferred_qs() handles the per-task component of any deferred quiescent state, and all other components of any deferred quiescent state are associated with the CPU, which isn't going anywhere until some later CPU-hotplug operation, which will report any remaining deferred quiescent states from within the rcu_report_dead() function. Note also that negative values of ->rcu_read_lock_nesting need not be considered. First, these won't show up in exit_rcu() unless there is a serious bug in RCU, and second, setting ->rcu_read_lock_nesting sets the state so that the RCU read-side critical section will be exited normally. Again, this code has no effect unless there has been some prior bug that prevents a task from leaving an RCU read-side critical section before exiting. Furthermore, there have been no reports of the bug fixed by this commit appearing in production. This commit is therefore absolutely -not- recommended for backporting to -stable. Reported-by: ABHISHEK DUBEY Reported-by: BHARATH Y MOURYA Reported-by: Aravinda Prasad Signed-off-by: Paul E. 
McKenney Tested-by: ABHISHEK DUBEY --- kernel/rcu/tree_plugin.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..d408661d5fb7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -804,19 +804,25 @@ static void rcu_flavor_sched_clock_irq(int user) /* * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so. No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort. Spewing warnings from this + * function is like as not to simply obscure important prior warnings. */ void exit_rcu(void) { struct task_struct *t = current; - if (likely(list_empty(¤t->rcu_node_entry))) + if (unlikely(!list_empty(¤t->rcu_node_entry))) { + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special.b.blocked = true; + } else if (unlikely(t->rcu_read_lock_nesting)) { + t->rcu_read_lock_nesting = 1; + } else { return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special.b.blocked = true; + } __rcu_read_unlock(); rcu_preempt_deferred_qs(current); } -- cgit v1.2.3 From 3ffe3d1adc0b6cfb9b24db9995a537bb0aa30a8b Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 21 Feb 2019 22:13:27 +0800 Subject: rcu: Set rcutree.kthread_prio sysfs access to read-only The rcutree.kthread_prio kernel-boot parameter is used to set the priority for boost (rcub), per-CPU (rcuc), and grace-period (rcu_preempt or rcu_sched) kthreads. It is also used by rcutorture to check whether it is possible to meaningfully test RCU priority boosting. However, all of these cases will either ignore or be confused by any post-boot changes to rcutree.kthread_prio. Note that the user really can change the priorities of all of these kthreads using chrt, given sufficient privileges. Therefore, the read-write nature of sysfs access to rcutree.kthread_prio is thus at best an attractive nuisance. This commit therefore changes sysfs access to rcutree.kthread_prio to be read-only. Signed-off-by: Liu Song Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2f78a115d34c..296131450414 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,7 @@ static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. */ -- cgit v1.2.3 From b2eb85b49a576515fb845cb12568b173c2bedffc Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sat, 2 Mar 2019 17:25:19 +0900 Subject: rcu: Move common code out of if-else block As the result of recent addition of "rdp->core_needs_qs = false;" in the "if" block, now both branches of the if-else have the same assignment. Factor it out and reduce line count. Signed-off-by: Akira Yokosawa Cc: Joel Fernandes Signed-off-by: Paul E. 
McKenney Acked-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 296131450414..5aefd36ac648 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2295,12 +2295,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { - rdp->core_needs_qs = false; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = false; - /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. -- cgit v1.2.3 From da8739f23fadf05809c6c37c327367b229467045 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Mar 2019 15:28:19 -0800 Subject: rcu: Allow rcu_nocbs= to specify all CPUs Currently, the rcu_nocbs= kernel boot parameter requires that a specific list of CPUs be specified, and has no way to say "all of them". As noted by user RavFX in a comment to Phoronix topic 1002538, this is an inconvenient side effect of the removal of the RCU_NOCB_CPU_ALL Kconfig option. This commit therefore enables the rcu_nocbs= kernel boot parameter to be given the string "all", as in "rcu_nocbs=all" to specify that all CPUs on the system are to have their RCU callbacks offloaded. Another approach would be to make cpulist_parse() check for "all", but there are uses of cpulist_parse() that do other checking, which could conflict with an "all". This commit therefore focuses on the specific use of cpulist_parse() in rcu_nocb_setup(). Just a note to other people who would like changes to Linux-kernel RCU: If you send your requests to me directly, they might get fixed somewhat faster. RavFX's comment was posted on January 22, 2018 and I first saw it on March 5, 2019. And the only reason that I found it -at- -all- was that I was looking for projects using RCU, and my search engine showed me that Phoronix comment quite by accident. Your choice, though! ;-) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d408661d5fb7..ed4a6dabf31d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1776,7 +1776,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - cpulist_parse(str, rcu_nocb_mask); + if (!strcasecmp(str, "all")) + cpumask_setall(rcu_nocb_mask); + else + cpulist_parse(str, rcu_nocb_mask); return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); -- cgit v1.2.3 From 497e42600b69aca1b799c840d2cfc7ad60bb8017 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2019 14:47:56 -0800 Subject: rcu: Report error for bad rcu_nocbs= parameter values This commit prints a console message when cpulist_parse() reports a bad list of CPUs, and sets all CPUs' bits in that case. The reason for setting all CPUs' bits is that this is the safe(r) choice for real-time workloads, which would normally be the ones using the rcu_nocbs= kernel boot parameter. Either way, later RCU console log messages list the actual set of CPUs whose RCU callbacks will be offloaded. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed4a6dabf31d..f0aeb7416dcc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1772,14 +1772,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges. If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); if (!strcasecmp(str, "all")) cpumask_setall(rcu_nocb_mask); else - cpulist_parse(str, rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); -- cgit v1.2.3 From 0f58d2ac2c87d27006c2b610668ebc4ff1f7c3ba Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 8 Mar 2019 15:16:18 +0530 Subject: rcu: Fix self-wakeups for grace-period kthread The current rcu_gp_kthread_wake() function uses in_interrupt() and thus does a self-wakeup from all interrupt contexts, including the pointless case where the GP kthread happens to be running with bottom halves disabled, along with the impossible case where the GP kthread is running within an NMI handler (you are not supposed to invoke rcu_gp_kthread_wake() from within an NMI handler. This commit therefore replaces the in_interrupt() with in_irq(), so that the self-wakeups happen only from handlers for hardware interrupts and softirqs. This also makes the code match the comment. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5aefd36ac648..139fa1f5c537 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1585,7 +1585,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) static void rcu_gp_kthread_wake(void) { if ((current == rcu_state.gp_kthread && - !in_interrupt() && !in_serving_softirq()) || + !in_irq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; -- cgit v1.2.3 From 6973032a602ee678c98644a30d57ebf9c72dd6d3 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 11 Mar 2019 15:16:11 +0530 Subject: rcu: Default jiffies_to_sched_qs to jiffies_till_sched_qs The current code only calls adjust_jiffies_till_sched_qs() if jiffies_till_sched_qs is left at its default value, so when the jiffies_till_sched_qs kernel-boot parameter actually is specified, jiffies_to_sched_qs will be left with the value zero, which will result in useless slowdowns of cond_resched(). This commit therefore changes rcu_init_geometry() to unconditionally invoke adjust_jiffies_till_sched_qs(), which ensures that jiffies_to_sched_qs will be initialized in all cases, thus maintaining good cond_resched() performance. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. 
--- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 139fa1f5c537..466299c3d2da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3739,8 +3739,7 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; - if (jiffies_till_sched_qs == ULONG_MAX) - adjust_jiffies_till_sched_qs(); + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && -- cgit v1.2.3 From 85f2b60c4321b088ba08ec9a05b8a7b68e4ada2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Mar 2019 15:45:13 -0700 Subject: rcu: Update jiffies_to_sched_qs and adjust_jiffies_till_sched_qs() comments This commit better documents the jiffies_to_sched_qs default-value strategy used by adjust_jiffies_till_sched_qs(). Reported-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 466299c3d2da..e117732bcd5d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -406,7 +406,7 @@ static bool rcu_kick_kthreads; */ static ulong jiffies_till_sched_qs = ULONG_MAX; module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ /* @@ -424,6 +424,7 @@ static void adjust_jiffies_till_sched_qs(void) WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); return; } + /* Otherwise, set to third fqs scan, but bound below on large system. */ j = READ_ONCE(jiffies_till_first_fqs) + 2 * READ_ONCE(jiffies_till_next_fqs); if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) -- cgit v1.2.3 From 5d8a752e31aaa4c9703f201956a40b45ed791217 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 20 Mar 2019 03:33:00 +0000 Subject: rcu: Fix force_qs_rnp() header comment Previously, threads blocked on offlining CPUs were migrated to the root rcu_node structure, thus requiring RCU priority boosting on this structure. However, since commit d19fb8d1f3f6 ("rcu: Don't migrate blocked tasks even if all corresponding CPUs offline"), RCU does not migrate blocked tasks. Consequently, RCU no longer does RCU priority boosting on the root rcu_node structure as of commit 1be0085b515e ("rcu: Don't initiate RCU priority boosting on root rcu_node"). This commit therefore brings the force_qs_rnp() function's header comment in line with this new no-root-boosting reality. Signed-off-by: Zhouyi Zhou [ paulmck: Also remove obsolete comment on suppressing new grace periods. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e117732bcd5d..abc8512ceb5f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2548,11 +2548,11 @@ void rcu_sched_clock_irq(int user) } /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods.
+ * Scan the leaf rcu_node structures. For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state. */ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { -- cgit v1.2.3 From a2badefa8574a7a424a72f67a3bd43248141f76a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 16:29:50 -0700 Subject: rcu: Eliminate redundant NULL-pointer check Because rcu_wake_cond() checks for a null task_struct pointer, there is no need for its callers to do so. This commit eliminates the redundant check. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f0aeb7416dcc..81d3cd821891 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1191,8 +1191,6 @@ static int rcu_boost_kthread(void *arg) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - struct task_struct *t; - raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1206,9 +1204,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - t = rnp->boost_kthread_task; - if (t) - rcu_wake_cond(t, rnp->boost_kthread_status); + rcu_wake_cond(rnp->boost_kthread_task, + rnp->boost_kthread_status); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -- cgit v1.2.3 From f1a98045abd824df833354b309b5fa64ff95f792 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 23 Mar 2019 09:19:12 -0700 Subject: rcu: Fix typo in tree_exp.h comment This commit changes a rcu_exp_handler() comment from rcu_preempt_defer_qs() to rcu_preempt_deferred_qs() in order to better match reality. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..ec4fb93a5dbe 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -648,7 +648,7 @@ static void rcu_exp_handler(void *unused) * * If the CPU is fully enabled (or if some buggy RCU-preempt * read-side critical section is being used from idle), just - * invoke rcu_preempt_defer_qs() to immediately report the + * invoke rcu_preempt_deferred_qs() to immediately report the * quiescent state. We cannot use rcu_read_unlock_special() * because we are in an interrupt handler, which will cause that * function to take an early exit without doing anything. -- cgit v1.2.3 From add0d37b4f1e77de7d170ece43c8d765572a1eab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Mar 2019 10:22:22 -0700 Subject: rcu: Correct READ_ONCE()/WRITE_ONCE() for ->rcu_read_unlock_special The task_struct structure's ->rcu_read_unlock_special field is only ever read or written by the owning task, but it is accessed both at process and interrupt levels. It may therefore be accessed using plain reads and writes while interrupts are disabled, but must be accessed using READ_ONCE() and WRITE_ONCE() or better otherwise. 
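[ Illustration, not part of the original posting: the access discipline in miniature. The two macros are simplified user-space stand-ins for the kernel's READ_ONCE() and WRITE_ONCE(), and the task structure is invented for the example. ]

#include <stdbool.h>
#include <stdio.h>

/* Simplified user-space stand-ins for the kernel's once-access macros. */
#define WRITE_ONCE(x, val) (*(volatile typeof(x) *)&(x) = (val))
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))

/* Invented miniature of the task_struct field in question. */
struct task_like {
	bool need_qs;	/* stands in for ->rcu_read_unlock_special.b.need_qs */
};

static struct task_like task;

/* With "interrupts disabled" nothing can intervene, so a plain store is fine. */
static void clear_with_irqs_disabled(void)
{
	task.need_qs = false;
}

/* Otherwise an interrupt handler may also touch the field, so mark the store. */
static void clear_with_irqs_enabled(void)
{
	WRITE_ONCE(task.need_qs, false);
}

int main(void)
{
	task.need_qs = true;
	clear_with_irqs_enabled();
	printf("need_qs = %d\n", (int)READ_ONCE(task.need_qs));
	clear_with_irqs_disabled();
	return 0;
}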
This commit makes a few adjustments to align with this discipline. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index ec4fb93a5dbe..1ee0782213b8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -633,7 +633,7 @@ static void rcu_exp_handler(void *unused) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->deferred_qs = true; - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81d3cd821891..6ddb3c05e88f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -285,7 +285,7 @@ static void rcu_qs(void) TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ - current->rcu_read_unlock_special.b.need_qs = false; + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); } } @@ -817,7 +817,7 @@ void exit_rcu(void) if (unlikely(!list_empty(¤t->rcu_node_entry))) { t->rcu_read_lock_nesting = 1; barrier(); - t->rcu_read_unlock_special.b.blocked = true; + WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); } else if (unlikely(t->rcu_read_lock_nesting)) { t->rcu_read_lock_nesting = 1; } else { -- cgit v1.2.3 From 5cdfd174ea6c2dc1d331b61bdc9572698658600a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Feb 2019 10:44:33 -0800 Subject: srcu: Check for in-flight callbacks in _cleanup_srcu_struct() If someone fails to drain the corresponding SRCU callbacks (for example, by failing to invoke srcu_barrier()) before invoking either cleanup_srcu_struct() or cleanup_srcu_struct_quiesced(), the resulting diagnostic is an ambiguous use-after-free diagnostic, and even then only if you are running something like KASAN. This commit therefore improves SRCU diagnostics by adding checks for in-flight callbacks at _cleanup_srcu_struct() time. Note that these diagnostics can still be defeated, for example, by invoking call_srcu() concurrently with cleanup_srcu_struct(). Which is a really bad idea, but sometimes all too easy to do. But even then, these diagnostics have at least some probability of catching the problem. Reported-by: Sagi Grimberg Reported-by: Bart Van Assche Signed-off-by: Paul E. McKenney Tested-by: Bart Van Assche --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a60b8ba9e1ac..4f30f3ecabc1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -387,6 +387,8 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) del_timer_sync(&sdp->delay_work); flush_work(&sdp->work); } + if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) + return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(ssp))) { -- cgit v1.2.3 From f5ad3991493c69d203d42b94d32349b54c58a3f1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 13 Feb 2019 13:54:37 -0800 Subject: srcu: Remove cleanup_srcu_struct_quiesced() The cleanup_srcu_struct_quiesced() function was added because NVME used WQ_MEM_RECLAIM workqueues and SRCU did not, which meant that NVME workqueues waiting on SRCU workqueues could result in deadlocks during low-memory conditions. However, SRCU now also has WQ_MEM_RECLAIM workqueues, so there is no longer a potential for deadlock. Furthermore, it turns out to be extremely hard to use cleanup_srcu_struct_quiesced() correctly due to the fact that SRCU callback invocation accesses the srcu_struct structure's per-CPU data area just after callbacks are invoked. Therefore, the usual practice of using srcu_barrier() to wait for callbacks to be invoked before invoking cleanup_srcu_struct_quiesced() fails because SRCU's callback-invocation workqueue handler might be delayed, which can result in cleanup_srcu_struct_quiesced() being invoked (and thus freeing the per-CPU data) before the SRCU's callback-invocation workqueue handler is finished using that per-CPU data. Nor is this a theoretical problem: KASAN emitted use-after-free warnings because of this problem on actual runs. In short, NVME can now safely invoke cleanup_srcu_struct(), which avoids the use-after-free scenario. And cleanup_srcu_struct_quiesced() is quite difficult to use safely. This commit therefore removes cleanup_srcu_struct_quiesced(), switching its sole user back to cleanup_srcu_struct(). This effectively reverts the following pair of commits: f7194ac32ca2 ("srcu: Add cleanup_srcu_struct_quiesced()") 4317228ad9b8 ("nvme: Avoid flush dependency in delete controller flow") Reported-by: Bart Van Assche Signed-off-by: Paul E. McKenney Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche --- kernel/rcu/rcutorture.c | 7 +------ kernel/rcu/srcutiny.c | 9 +++------ kernel/rcu/srcutree.c | 30 ++++++++++++------------------ 3 files changed, 16 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..d2b226110835 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -592,12 +592,7 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - static DEFINE_TORTURE_RANDOM(rand); - - if (torture_random(&rand) & 0x800) - cleanup_srcu_struct(&srcu_ctld); - else - cleanup_srcu_struct_quiesced(&srcu_ctld); + cleanup_srcu_struct(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5d4a39a6505a..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. 
*/ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); - if (quiesced) - WARN_ON(work_pending(&ssp->srcu_work)); - else - flush_work(&ssp->srcu_work); + flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 4f30f3ecabc1..9b761e546de8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) return SRCU_INTERVAL; } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; @@ -369,24 +375,12 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - if (quiesced) { - if (WARN_ON(delayed_work_pending(&ssp->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&ssp->work); - } + flush_delayed_work(&ssp->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - if (quiesced) { - if (WARN_ON(timer_pending(&sdp->delay_work))) - return; /* Just leak it! */ - if (WARN_ON(work_pending(&sdp->work))) - return; /* Just leak it! */ - } else { - del_timer_sync(&sdp->delay_work); - flush_work(&sdp->work); - } + del_timer_sync(&sdp->delay_work); + flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } @@ -399,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) free_percpu(ssp->sda); ssp->sda = NULL; } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the -- cgit v1.2.3 From 10462d6f58fb6dbde7563e9343505d98d5bfba3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 16:10:57 -0800 Subject: rcu: Move RCU CPU stall-warning code out of update.c The RCU CPU stall-warning code for normal grace periods is currently scattered across three files, due to earlier Tiny RCU support for RCU CPU stall warnings and for old Kconfig options that have long since been retired. Given that it is hard for the lead RCU maintainer to find relevant stall-warning code, it would be good to consolidate it. This commit starts this process by moving stall-warning code from kernel/rcu/update.c to a new kernel/rcu/tree_stall.h file. Note that the definitions of rcu_cpu_stall_suppress and rcu_cpu_stall_timeout must remain in kernel/rcu/update.c to provide compatibility for kernel boot parameter lists. Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcu.h | 1 + kernel/rcu/tree.c | 1 + kernel/rcu/tree_stall.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 59 +-------------------------------------------- 4 files changed, 66 insertions(+), 58 deletions(-) create mode 100644 kernel/rcu/tree_stall.h (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..4b58c907b4b7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); #define rcu_ftrace_dump_stall_suppress() \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..424d50ccf9e6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3858,5 +3858,6 @@ void __init rcu_init(void) srcu_init(); } +#include "tree_stall.h" #include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..682189f4d083 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney + */ + + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + WRITE_ONCE(rcu_cpu_stall_timeout, 3); + till_stall_check = 3; + } else if (till_stall_check > 300) { + WRITE_ONCE(rcu_cpu_stall_timeout, 300); + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index cbaa976c5945..c3bf44ba42e5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif #ifdef CONFIG_RCU_STALL_COMMON - -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif - int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. 
*/ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ - int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - WRITE_ONCE(rcu_cpu_stall_timeout, 3); - till_stall_check = 3; - } else if (till_stall_check > 300) { - WRITE_ONCE(rcu_cpu_stall_timeout, 300); - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ - if (!rcu_cpu_stall_suppress) - rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ - if (rcu_cpu_stall_suppress == 2) - rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); - return 0; -} -early_initcall(check_cpu_stall_init); - #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_TASKS_RCU -- cgit v1.2.3 From 3fc3d1709fc75995ee09ad4f35f160cf360b397b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 16:34:47 -0800 Subject: rcu: Move RCU CPU stall-warning code out of tree_plugin.h The RCU CPU stall-warning code for normal grace periods is currently scattered across two files, due to earlier Tiny RCU support for RCU CPU stall warnings and for old Kconfig options that have long since been retired. Given that it is hard for the lead RCU maintainer to find relevant stall-warning code, it would be good to consolidate it. This commit continues this process by moving stall-warning code from kernel/rcu/tree_plugin.h to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 90 --------------------------------------------- kernel/rcu/tree_stall.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..7fa3bc4d481b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,79 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_deferred_qs_irqrestore(t, flags); } -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. - */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - /* - * We could be printing a lot while holding a spinlock. - * Avoid triggering hard lockup.
- */ - touch_nmi_watchdog(); - sched_show_task(t); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return 0; - rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - rcu_print_task_stall_end(); - return ndetected; -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -979,23 +906,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) } static void rcu_preempt_deferred_qs(struct task_struct *t) { } -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - return 0; -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 682189f4d083..6f5f94944f49 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -61,3 +61,98 @@ static int __init check_cpu_stall_init(void) return 0; } early_initcall(check_cpu_stall_init); + +#ifdef CONFIG_PREEMPT + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period. 
+ */ +static void rcu_print_detail_task_stall(void) +{ + struct rcu_node *rnp = rcu_get_root(); + + rcu_print_detail_task_stall_rnp(rnp); + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); +} + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ + pr_cont("\n"); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return 0; + rcu_print_task_stall_begin(rnp); + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + rcu_print_task_stall_end(); + return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall(void) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT */ -- cgit v1.2.3 From 32255d51b6ed00de2b88970ceea8db0ec3bae6f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 16:57:41 -0800 Subject: rcu: Move RCU CPU stall-warning code out of tree.c This commit completes the process of consolidating the code for RCU CPU stall warnings for normal grace periods by moving the remaining such code from kernel/rcu/tree.c to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 291 ----------------------------------------------- kernel/rcu/tree.h | 10 +- kernel/rcu/tree_stall.h | 292 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 299 insertions(+), 294 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 424d50ccf9e6..001dd05f6e38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,8 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; /* Commandeer a sysrq key to dump RCU's tree. */ static bool sysrq_rcu; module_param(sysrq_rcu, bool, 0444); @@ -1167,295 +1165,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. 
- */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); - } - } -} - -/* - * Dump stacks of all tasks running on stalled CPUs. First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps. The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ - int cpu; - unsigned long flags; - unsigned long gpa; - unsigned long j; - int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(cpu); - ndetected++; - } - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - if (ndetected) { - rcu_dump_cpu_stacks(); - - /* Complain about tasks blocking the grace period. 
*/ - rcu_print_detail_task_stall(); - } else { - if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { - pr_err("INFO: Stall ended before state dump start\n"); - } else { - j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), - rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); - } - } - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - - rcu_check_gp_kthread_starvation(); - - panic_on_rcu_stall(); - - rcu_force_quiescent_state(); /* Kick them all. */ -} - -static void print_cpu_stall(void) -{ - int cpu; - unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); - raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(smp_processor_id()); - raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - - rcu_check_gp_kthread_starvation(); - - rcu_dump_cpu_stacks(); - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - panic_on_rcu_stall(); - - /* - * Attempt to revive the RCU machinery by forcing a context switch. - * - * A context switch would normally allow the RCU state machine to make - * progress and it could be we're stuck in kernel space without context - * switches for an entirely unreasonable amount of time. - */ - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ - unsigned long gs1; - unsigned long gs2; - unsigned long gps; - unsigned long j; - unsigned long jn; - unsigned long js; - struct rcu_node *rnp; - - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress()) - return; - rcu_stall_kick_kthreads(); - j = jiffies; - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rcu_state.gp_seq, then - * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally - * another copy of rcu_state.gp_seq. These values are updated in - * the opposite order with memory barriers (or equivalent) during - * grace-period initialization and cleanup. Now, a false positive - * can occur if we get an new value of rcu_state.gp_start and a old - * value of rcu_state.jiffies_stall. But given the memory barriers, - * the only way that this can happen is if one grace period ends - * and another starts between these two fetches. 
This is detected - * by comparing the second fetch of rcu_state.gp_seq with the - * previous fetch from rcu_state.gp_seq. - * - * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, - * and rcu_state.gp_start suffice to forestall false positives. - */ - gs1 = READ_ONCE(rcu_state.gp_seq); - smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rcu_state.jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rcu_state.gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rcu_state.gp_seq); - if (gs1 != gs2 || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. */ - rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); - } -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bb4f995f2d3f..3c4e26fff806 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; int rcu_dynticks_snap(struct rcu_data *rdp); -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); @@ -445,3 +443,9 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); + +/* Forward declarations for tree_stall.h */ +static void rcu_print_detail_task_stall(void); +static int rcu_print_task_stall(struct rcu_node *rnp); +static void record_gp_stall_check_time(void); +static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6f5f94944f49..e0e73f493363 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -8,6 +8,9 @@ */ +/* panic() on RCU Stall sysctl. 
*/ +int sysctl_panic_on_rcu_stall __read_mostly; + #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) #else @@ -156,3 +159,292 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return 0; } #endif /* #else #ifdef CONFIG_PREEMPT */ + +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + rcu_state.gp_start = j; + j1 = rcu_jiffies_till_stall_check(); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* + * Complain about starvation of grace-period kthread. + */ +static void rcu_check_gp_kthread_starvation(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + READ_ONCE(rcu_state.gp_flags), + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + wake_up_process(gpk); + } + } +} + +/* + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(void) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +static void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +static void print_other_cpu_stall(unsigned long gp_seq) +{ + int cpu; + unsigned long flags; + unsigned long gpa; + unsigned long j; + int ndetected = 0; + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. 
+ */ + pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); + print_cpu_stall_info_begin(); + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(cpu); + ndetected++; + } + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + print_cpu_stall_info_end(); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + if (ndetected) { + rcu_dump_cpu_stacks(); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(); + } else { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = READ_ONCE(rcu_state.gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", + rcu_state.name, j - gpa, j, gpa, + READ_ONCE(jiffies_till_next_fqs), + rcu_get_root()->qsmask); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + + rcu_check_gp_kthread_starvation(); + + panic_on_rcu_stall(); + + rcu_force_quiescent_state(); /* Kick them all. */ +} + +static void print_cpu_stall(void) +{ + int cpu; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); + print_cpu_stall_info_begin(); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); + print_cpu_stall_info(smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); + print_cpu_stall_info_end(); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", + jiffies - rcu_state.gp_start, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + + rcu_check_gp_kthread_starvation(); + + rcu_dump_cpu_stacks(); + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + panic_on_rcu_stall(); + + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. 
+ */ + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + +static void check_cpu_stall(struct rcu_data *rdp) +{ + unsigned long gs1; + unsigned long gs2; + unsigned long gps; + unsigned long j; + unsigned long jn; + unsigned long js; + struct rcu_node *rnp; + + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress()) + return; + rcu_stall_kick_kthreads(); + j = jiffies; + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rcu_state.gp_seq, then + * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally + * another copy of rcu_state.gp_seq. These values are updated in + * the opposite order with memory barriers (or equivalent) during + * grace-period initialization and cleanup. Now, a false positive + * can occur if we get an new value of rcu_state.gp_start and a old + * value of rcu_state.jiffies_stall. But given the memory barriers, + * the only way that this can happen is if one grace period ends + * and another starts between these two fetches. This is detected + * by comparing the second fetch of rcu_state.gp_seq with the + * previous fetch from rcu_state.gp_seq. + * + * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, + * and rcu_state.gp_start suffice to forestall false positives. + */ + gs1 = READ_ONCE(rcu_state.gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ + js = READ_ONCE(rcu_state.jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = READ_ONCE(rcu_state.gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rcu_state.gp_seq); + if (gs1 != gs2 || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ + rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + if (rcu_gp_in_progress() && + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(); + + } else if (rcu_gp_in_progress() && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2); + } +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} -- cgit v1.2.3 From 21d0d79ab051bf9facb9960a30e58b93a31c75a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 20:36:45 -0800 Subject: rcu: Inline RCU task stall-warning helper functions The rcu_print_detail_task_stall(), rcu_print_task_stall_begin(), and rcu_print_task_stall_end() functions were defined to allow long-gone Kconfig options to provide an abbreviated RCU CPU stall warning printout. This commit saves a few lines of code by inlining them into their sole callers. While in the area, a useless call of rcu_print_detail_task_stall_rnp() on the root rcu_node structure was eliminated. If there is only one rcu_node structure, its tasks get printed twice, but if there are more, the root rcu_node structure is guaranteed to have an empty list of blocked tasks, hence the uselessness. 
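[ Illustration, not part of the original posting: a toy model of the double print. The one-node "tree", the iteration macro, and the print helper below are all invented; with a single rcu_node structure the root is also the sole leaf, so the old root-then-leaves sequence visits it twice. ]

#include <stdio.h>

struct rcu_node_like {
	int id;
};

/* A degenerate one-node "tree": the root is also the only leaf. */
static struct rcu_node_like only_node = { .id = 0 };

static struct rcu_node_like *rcu_get_root_sketch(void)
{
	return &only_node;
}

/* Invented stand-in for rcu_for_each_leaf_node(): visits the single leaf. */
#define for_each_leaf_node_sketch(rnp) \
	for ((rnp) = &only_node; (rnp) != NULL; (rnp) = NULL)

static void print_stall_rnp_sketch(struct rcu_node_like *rnp)
{
	printf("dumping tasks blocked on rcu_node %d\n", rnp->id);
}

int main(void)
{
	struct rcu_node_like *rnp;

	/* Old scheme: explicit root call plus leaf iteration, so the sole
	 * rcu_node structure gets dumped twice. */
	print_stall_rnp_sketch(rcu_get_root_sketch());
	for_each_leaf_node_sketch(rnp)
		print_stall_rnp_sketch(rnp);
	return 0;
}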
(Long ago, root rcu_node structures with non-empty ->blkd_tasks lists could happen, but no longer.) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_stall.h | 36 +++++++----------------------------- 2 files changed, 7 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3c4e26fff806..c6df9a13dd06 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -445,7 +445,6 @@ static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); /* Forward declarations for tree_stall.h */ -static void rcu_print_detail_task_stall(void); static int rcu_print_task_stall(struct rcu_node *rnp); static void record_gp_stall_check_time(void); static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e0e73f493363..b476786b8ef7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -94,30 +94,6 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -129,14 +105,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; - rcu_print_task_stall_begin(rnp); + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } - rcu_print_task_stall_end(); + pr_cont("\n"); return ndetected; } @@ -146,7 +123,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static void rcu_print_detail_task_stall(void) +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) { } @@ -253,7 +230,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) unsigned long gpa; unsigned long j; int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); + struct rcu_node *rnp; long totqlen = 0; /* Kick and suppress, if so configured. */ @@ -291,7 +268,8 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_dump_cpu_stacks(); /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(); + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); } else { if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); -- cgit v1.2.3 From d87cda5094585b7a0f62075de68266cb9c1b35ca Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 11 Jan 2019 20:51:49 -0800 Subject: rcu: Move rcu_print_task_exp_stall() to tree_exp.h Because expedited CPU stall warnings are contained within the kernel/rcu/tree_exp.h file, rcu_print_task_exp_stall() should live there too. This commit carries out the required code motion. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 32 ++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 31 ------------------------------- 2 files changed, 32 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..7be3e085ddd6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -10,6 +10,7 @@ #include static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); /* * Record the start of an expedited grace period. @@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu) { } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + #else /* #ifdef CONFIG_PREEMPT_RCU */ /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu) WARN_ON_ONCE(ret); } +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7fa3bc4d481b..72519c57f656 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,27 +642,6 @@ static void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_deferred_qs_irqrestore(t, flags); } -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rnp->exp_tasks) - return 0; - t = list_entry(rnp->exp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - return ndetected; -} - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -906,16 +885,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) } static void rcu_preempt_deferred_qs(struct task_struct *t) { } -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections that are - * blocking the current expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - return 0; -} - /* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. 
So check only for -- cgit v1.2.3 From 40e69ac7d0c5a19ea14656bc3131c55719baec96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 20:58:58 -0800 Subject: rcu: Inline RCU stall-warning info helper functions The print_cpu_stall_info_begin() and print_cpu_stall_info_end() print a single character each onto the console, and are a holdover from a time when RCU CPU stall warning messages could be abbreviated using a long-gone Kconfig option. This commit therefore adds these single characters to already-printed strings in the calling functions, and then eliminates both print_cpu_stall_info_begin() and print_cpu_stall_info_end(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 -- kernel/rcu/tree_plugin.h | 12 ------------ kernel/rcu/tree_stall.h | 12 ++++-------- 3 files changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c6df9a13dd06..d73472af49e7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -416,9 +416,7 @@ static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 72519c57f656..2df5bb04fd7a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1550,12 +1550,6 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ - pr_cont("\n"); -} - /* * Print out diagnostic information for the specified stalled CPU. * @@ -1606,12 +1600,6 @@ static void print_cpu_stall_info(int cpu) fast_no_hz); } -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ - pr_err("\t"); -} - /* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ static void zero_cpu_stall_ticks(struct rcu_data *rdp) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b476786b8ef7..7ef3b596e45f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -243,8 +243,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. 
*/ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); + pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); @@ -258,10 +257,9 @@ static void print_other_cpu_stall(unsigned long gp_seq) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rcu_state.gp_start), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { @@ -314,15 +312,13 @@ static void print_cpu_stall(void) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); + pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(smp_processor_id()); raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", jiffies - rcu_state.gp_start, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); -- cgit v1.2.3 From 59b73a27681c5841440391f970a9a085228ba975 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jan 2019 21:05:17 -0800 Subject: rcu: Move FAST_NO_HZ stall-warning code to tree_stall.h This commit further consolidates the stall-warning code by moving print_cpu_stall_info() and its helper functions along with zero_cpu_stall_ticks() to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 80 ------------------------------------------------ kernel/rcu/tree_stall.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d73472af49e7..49bf3b00bb50 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -416,7 +416,6 @@ static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info(int cpu); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2df5bb04fd7a..a1f9d7c15bd8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1528,86 +1528,6 @@ static void rcu_cleanup_after_idle(void) #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!rdp->tick_nohz_enabled_snap]); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware. Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ - unsigned long delta; - char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - char *ticks_title; - unsigned long ticks_value; - - /* - * We could be printing a lot while holding a spinlock. Avoid - * triggering hard lockup. - */ - touch_nmi_watchdog(); - - ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); - if (ticks_value) { - ticks_title = "GPs behind"; - } else { - ticks_title = "ticks this GP"; - ticks_value = rdp->ticks_this_gp; - } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", - cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], - "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], - !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : - rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : - "!."[!delta], - ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, - rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. 
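The stall-info lines assembled in this diff lean on a terse C idiom: indexing a two-character string literal with a boolean, as in ".l"[rdp->all_lazy]. A tiny illustration in plain C, with invented values:

#include <stdio.h>

int main(void)
{
	int all_lazy = 1;
	int nonlazy_pending = 0;

	/*
	 * "xY"[cond] indexes a two-char string literal: '.' for 0,
	 * the flag letter for 1; !! folds any nonzero value to 1.
	 */
	printf("%c%c\n", ".l"[!!all_lazy], ".L"[!!nonlazy_pending]); /* "l." */
	return 0;
}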
*/ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} - #ifdef CONFIG_RCU_NOCB_CPU /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 7ef3b596e45f..19b915380a6f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -223,6 +223,86 @@ static void panic_on_rcu_stall(void) panic("RCU Stall\n"); } +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + + sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + rdp->last_accelerate & 0xffff, jiffies & 0xffff, + ".l"[rdp->all_lazy], + ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], + ".D"[!rdp->tick_nohz_enabled_snap]); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + *cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(int cpu) +{ + unsigned long delta; + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + char *ticks_title; + unsigned long ticks_value; + + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], + ticks_value, ticks_title, + rcu_dynticks_snap(rdp) & 0xfff, + rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + static void print_other_cpu_stall(unsigned long gp_seq) { int cpu; -- cgit v1.2.3 From e23344c2ca42d0083596bb39964675bef00ad691 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 12 Jan 2019 09:35:44 -0800 Subject: rcu: Organize functions in tree_stall.h This commit does only code movement, removal of now-unneeded forward declarations, and addition of comments. It organizes the functions that implement RCU CPU stall warnings for normal grace periods into three categories: 1. 
Control of RCU CPU stall warnings, including computing timeouts. 2. Interaction of stall warnings with grace periods. 3. Actual printing of the RCU CPU stall-warning messages. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_stall.h | 180 ++++++++++++++++++++++++++---------------------- 2 files changed, 97 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 49bf3b00bb50..099410dbcbe9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -442,6 +442,5 @@ static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); /* Forward declarations for tree_stall.h */ -static int rcu_print_task_stall(struct rcu_node *rnp); static void record_gp_stall_check_time(void); static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 19b915380a6f..03ed47883d8a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,9 @@ * Author: Paul E. McKenney */ +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; @@ -17,6 +20,7 @@ int sysctl_panic_on_rcu_stall __read_mostly; #define RCU_STALL_DELAY_DELTA 0 #endif +/* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); @@ -36,6 +40,7 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { if (!rcu_cpu_stall_suppress) @@ -48,6 +53,7 @@ void rcu_sysrq_end(void) rcu_cpu_stall_suppress = 0; } +/* Don't print RCU CPU stall warnings during a kernel panic. */ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { rcu_cpu_stall_suppress = 1; @@ -65,6 +71,78 @@ static int __init check_cpu_stall_init(void) } early_initcall(check_cpu_stall_init); +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). */ +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + rcu_state.gp_start = j; + j1 = rcu_jiffies_till_stall_check(); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. 
*/ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + #ifdef CONFIG_PREEMPT /* @@ -137,43 +215,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } #endif /* #else #ifdef CONFIG_PREEMPT */ -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); - } - } -} - /* * Dump stacks of all tasks running on stalled CPUs. First try using * NMIs, but fall back to manual remote stack tracing on architectures @@ -196,33 +237,6 @@ static void rcu_dump_cpu_stacks(void) } } -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -295,12 +309,26 @@ static void print_cpu_stall_info(int cpu) fast_no_hz); } -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) +/* Complain about starvation of grace-period kthread. 
*/ +static void rcu_check_gp_kthread_starvation(void) { - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + READ_ONCE(rcu_state.gp_flags), + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + wake_up_process(gpk); + } + } } static void print_other_cpu_stall(unsigned long gp_seq) @@ -488,17 +516,3 @@ static void check_cpu_stall(struct rcu_data *rdp) print_other_cpu_stall(gs2); } } - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} -- cgit v1.2.3 From 7ac1907c9e7ba5f80d3c298fe9d2dbf620566a49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Jan 2019 10:19:20 -0800 Subject: rcu: Move irq-disabled stall-warning checking to tree_stall.h The rcu_iw_handler() function's sole purpose in life is to indicate whether a stalled CPU had interrupts disabled, so it belongs in kernel/rcu/tree_stall.h. This commit therefore makes that move, clarifying its header comment while in the area. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 --------------------- kernel/rcu/tree.h | 1 + kernel/rcu/tree_stall.h | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 001dd05f6e38..929531ed168c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1031,27 +1031,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) return 0; } -/* - * Handler for the irq_work request posted when a grace period has - * gone on for too long, but not yet long enough for an RCU CPU - * stall warning. Set state appropriately, but just complain if - * there is unexpected state on entry. 
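For context on the handler being relocated: rcu_iw follows the stock irq_work pattern, and the fact that an irq_work handler can only run once the target CPU services the request is exactly what makes it usable as an irqs-disabled detector. A rough sketch of that pattern with invented names; the real code posts to a remote CPU via irq_work_queue_on():

#include <linux/irq_work.h>
#include <linux/kernel.h>

struct stall_probe {
	struct irq_work iw;
	bool pending;
};

/* Runs in hard-irq context once the target CPU services the request. */
static void stall_probe_handler(struct irq_work *iwp)
{
	struct stall_probe *sp = container_of(iwp, struct stall_probe, iw);

	sp->pending = false;
}

static void stall_probe_post(struct stall_probe *sp)
{
	init_irq_work(&sp->iw, stall_probe_handler);
	sp->pending = true;
	irq_work_queue(&sp->iw);
	/*
	 * If sp->pending is still set much later, no interrupt has been
	 * taken since the post, the very condition rcu_iw is probing for.
	 */
}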
- */ -static void rcu_iw_handler(struct irq_work *iwp) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = container_of(iwp, struct rcu_data, rcu_iw); - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); - if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gp_seq = rnp->gp_seq; - rdp->rcu_iw_pending = false; - } - raw_spin_unlock_rcu_node(rnp); -} - /* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 099410dbcbe9..f882ce3ca5a5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -443,4 +443,5 @@ static void rcu_dynticks_task_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 03ed47883d8a..526e223e41ce 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -139,6 +139,26 @@ static void rcu_stall_kick_kthreads(void) } } +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling. Set state + * appropriately, but just complain if there is unexpected state on entry. + */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gp_seq = rnp->gp_seq; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + ////////////////////////////////////////////////////////////////////////////// // // Printing RCU CPU stall warnings -- cgit v1.2.3 From b51bcbbf16ef0ea352e8b924dd8638112e4037a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Jan 2019 07:01:33 -0800 Subject: rcu: Move forward-progress checkers into tree_stall.h This commit further consolidates stall-warning functionality by moving forward-progress checkers into kernel/rcu/tree_stall.h, updating a comment or two while in the area. More specifically, this commit moves show_rcu_gp_kthreads(), rcu_check_gp_start_stall(), rcu_fwd_progress_check(), sysrq_rcu, sysrq_show_rcu(), sysrq_rcudump_op, and rcu_sysrq_init() from kernel/rcu/tree.c to kernel/rcu/tree_stall.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 166 ---------------------------------------------- kernel/rcu/tree.h | 2 + kernel/rcu/tree_stall.h | 171 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 166 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 929531ed168c..4cb1ebd93b0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,9 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* Commandeer a sysrq key to dump RCU's tree. */ -static bool sysrq_rcu; -module_param(sysrq_rcu, bool, 0444); /* * The rcu_scheduler_active variable is initialized to the value @@ -510,74 +507,6 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } -/* - * Show the state of the grace-period kthreads. 
- */ -void show_rcu_gp_kthreads(void) -{ - int cpu; - unsigned long j; - unsigned long ja; - unsigned long jr; - unsigned long jw; - struct rcu_data *rdp; - struct rcu_node *rnp; - - j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", - rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); - rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)rnp->gp_seq, - (long)rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) - continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)rdp->gp_seq_needed); - } - } - /* sched_show_task(rcu_state.gp_kthread); */ -} -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); - -/* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) -{ - show_rcu_gp_kthreads(); -} - -static struct sysrq_key_op sysrq_rcudump_op = { - .handler = sysrq_show_rcu, - .help_msg = "show-rcu(y)", - .action_msg = "Show RCU tree", - .enable_mask = SYSRQ_ENABLE_DUMP, -}; - -static int __init rcu_sysrq_init(void) -{ - if (sysrq_rcu) - return register_sysrq_key('y', &sysrq_rcudump_op); - return 0; -} -early_initcall(rcu_sysrq_init); - /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -2323,101 +2252,6 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, - const unsigned long gpssdelay) -{ - unsigned long flags; - unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(); - static atomic_t warned = ATOMIC_INIT(0); - - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) - return; - j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) - return; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - /* Hold onto the leaf lock to make others see warned==1. */ - - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. 
*/ - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || - atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - WARN_ON(1); - if (rnp_root != rnp) - raw_spin_unlock_rcu_node(rnp_root); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - show_rcu_gp_kthreads(); -} - -/* - * Do a forward-progress check for rcutorture. This is normally invoked - * due to an OOM event. The argument "j" gives the time period during - * which rcutorture would like progress to have been made. - */ -void rcu_fwd_progress_check(unsigned long j) -{ - unsigned long cbs; - int cpu; - unsigned long max_cbs = 0; - int max_cpu = -1; - struct rcu_data *rdp; - - if (rcu_gp_in_progress()) { - pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); - show_rcu_gp_kthreads(); - } else { - pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); - preempt_disable(); - rdp = this_cpu_ptr(&rcu_data); - rcu_check_gp_start_stall(rdp->mynode, rdp, j); - preempt_enable(); - } - for_each_possible_cpu(cpu) { - cbs = rcu_get_n_cbs_cpu(cpu); - if (!cbs) - continue; - if (max_cpu < 0) - pr_info("%s: callbacks", __func__); - pr_cont(" %d: %lu", cpu, cbs); - if (cbs <= max_cbs) - continue; - max_cbs = cbs; - max_cpu = cpu; - } - if (max_cpu >= 0) - pr_cont("\n"); -} -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); - /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(struct softirq_action *unused) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f882ce3ca5a5..e253d11af3c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -445,3 +445,5 @@ static void rcu_dynticks_task_exit(void); static void record_gp_stall_check_time(void); static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 526e223e41ce..9e3db08d02bc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -536,3 +536,174 @@ static void check_cpu_stall(struct rcu_data *rdp) print_other_cpu_stall(gs2); } } + +////////////////////////////////////////////////////////////////////////////// +// +// RCU forward-progress mechanisms, including of callback invocation. + + +/* + * Show the state of the grace-period kthreads. + */ +void show_rcu_gp_kthreads(void) +{ + int cpu; + unsigned long j; + unsigned long ja; + unsigned long jr; + unsigned long jw; + struct rcu_data *rdp; + struct rcu_node *rnp; + + j = jiffies; + ja = j - READ_ONCE(rcu_state.gp_activity); + jr = j - READ_ONCE(rcu_state.gp_req_activity); + jw = j - READ_ONCE(rcu_state.gp_wake_time); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, + rcu_state.gp_kthread ? 
rcu_state.gp_kthread->state : 0x1ffffL, + ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), + (long)READ_ONCE(rcu_state.gp_seq), + (long)READ_ONCE(rcu_get_root()->gp_seq_needed), + READ_ONCE(rcu_state.gp_flags)); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", + rnp->grplo, rnp->grphi, (long)rnp->gp_seq, + (long)rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rcu_state.gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)rdp->gp_seq_needed); + } + } + /* sched_show_task(rcu_state.gp_kthread); */ +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, rcu_state.gp_req_activity + gpssdelay) || + time_before(j, rcu_state.gp_activity + gpssdelay) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + show_rcu_gp_kthreads(); +} + +/* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. 
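The OOM event mentioned in this comment reaches rcutorture through the kernel's OOM notifier chain. A bare-bones sketch of such a registration, with invented names; the real rcutorture callback is where rcu_fwd_progress_check() gets invoked:

#include <linux/notifier.h>
#include <linux/oom.h>
#include <linux/printk.h>

static int fwd_oom_notify(struct notifier_block *self,
			  unsigned long unused, void *nfreed)
{
	/* rcutorture's real callback invokes rcu_fwd_progress_check() here. */
	pr_info("OOM hit during forward-progress testing\n");
	return NOTIFY_OK;
}

static struct notifier_block fwd_oom_nb = {
	.notifier_call = fwd_oom_notify,
};

/* In init code: register_oom_notifier(&fwd_oom_nb); */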
+ */ +void rcu_fwd_progress_check(unsigned long j) +{ + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); + show_rcu_gp_kthreads(); + } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); + +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ + show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { + .handler = sysrq_show_rcu, + .help_msg = "show-rcu(y)", + .action_msg = "Show RCU tree", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ + if (sysrq_rcu) + return register_sysrq_key('y', &sysrq_rcudump_op); + return 0; +} +early_initcall(rcu_sysrq_init); -- cgit v1.2.3 From 6c70e9cd5f3c6d93f3e1da6d101073e898f39170 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 8 Mar 2019 11:57:48 -0800 Subject: rcu: Fix nohz status in stall warning The Documentation/RCU/stallwarn.txt file says that stall warnings print "D" if dyntick-idle processing is enabled, but the code in print_cpu_stall_fast_no_hz() prints "." instead. This commit therefore reverses the sense of the test to make the code match the documentation. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 9e3db08d02bc..f65a73a97323 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -267,7 +267,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) rdp->last_accelerate & 0xffff, jiffies & 0xffff, ".l"[rdp->all_lazy], ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!rdp->tick_nohz_enabled_snap]); + ".D"[!!rdp->tick_nohz_enabled_snap]); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3 From 24aca4aea4f0179e0e56cf9ec610c27d07702945 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Jan 2019 19:23:00 -0800 Subject: torture: Don't try to offline the last CPU If there is only one online CPU, it doesn't make sense to try to offline it, as any such attempt is guaranteed to fail. This commit therefore checks for this condition and refuses to attempt the nonsensical. Reported-by: Su Yue Signed-off-by: Paul E. McKenney Tested-By: Su Yue --- kernel/torture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 8faa1a9aaeb9..17b2be9bde12 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; + if (num_online_cpus() <= 1) + return false; /* Can't offline the last CPU. 
*/ if (verbose > 1) pr_alert("%s" TORTURE_FLAG -- cgit v1.2.3 From a3b0e1e59ef1757488ef05b66bc376eaf7b06ada Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Feb 2019 15:06:13 -0800 Subject: rcutorture: Make rcutorture_extend_mask() comment match the code The code actually rarely uses more than one type of RCU read-side protection, as is desired given that we need some reasonable probability of preempting RCU read-side critical sections, which cannot happen with multiple types of protection. This commit therefore adjusts the comment. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..2453229ba15a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1160,7 +1160,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long randmask2 = randmask1 >> 3; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - /* Most of the time lots of bits, half the time only one bit. */ + /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else -- cgit v1.2.3 From f47cb1bb0da23162f4c17e0c0023df4889ecb492 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Feb 2019 15:59:43 -0800 Subject: rcutorture: Remove ->ext_irq_conflict field Back when there was a separate RCU-bh flavor, the ->ext_irq_conflict field was used to prevent executing local_bh_enable() while interrupts were disabled. However, there is no longer an RCU-bh flavor, so this commit removes the no-longer-needed ->ext_irq_conflict field. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2453229ba15a..21ab3c7eb221 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,7 +299,6 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; - int ext_irq_conflict; const char *name; }; @@ -1170,10 +1169,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & cur_ops->ext_irq_conflict) && - (oldmask & cur_ops->ext_irq_conflict)) - mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ return mask ?: RCUTORTURE_RDR_RCU; } -- cgit v1.2.3 From d44ac1bebc47e46d62019808a893582f56496a98 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Sat, 9 Mar 2019 00:40:45 +0530 Subject: rcutorture: Fix expected forward progress duration in OOM notifier The rcutorture_oom_notify() function has a misplaced close parenthesis that results in increasingly long delays in rcu_fwd_progress_check()'s checking for various RCU forward-progress problems. This commit therefore puts the parenthesis in the right place. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 21ab3c7eb221..b42682b94cb7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1843,7 +1843,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", -- cgit v1.2.3 From b813afae7ab6a5e91b4e16cc567331d9c2ae1f04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 09:27:28 -0700 Subject: rcutorture: Fix cleanup path for invalid torture_type strings If the specified rcutorture.torture_type is not in the rcu_torture_init() function's torture_ops[] array, rcutorture prints some console messages and then invokes rcu_torture_cleanup() to set state so that a future torture test can run. However, rcu_torture_cleanup() also attempts to end the test that didn't actually start, and in doing so relies on the value of cur_ops, a value that is not particularly relevant in this case. This can result in confusing output or even follow-on failures due to attempts to use facilities that have not been properly initialized. This commit therefore sets the value of cur_ops to NULL in this case and inserts a check near the beginning of rcu_torture_cleanup(), thus avoiding relying on an irrelevant cur_ops value. Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b42682b94cb7..e3c0f57ab0aa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2089,6 +2089,10 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } + if (!cur_ops) { + torture_cleanup_end(); + return; + } rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); @@ -2262,6 +2266,7 @@ rcu_torture_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { -- cgit v1.2.3 From ad092c027713a68a34168942a5ef422e42e039f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 10:26:41 -0700 Subject: rcuperf: Fix cleanup path for invalid perf_type strings If the specified rcuperf.perf_type is not in the rcu_perf_init() function's perf_ops[] array, rcuperf prints some console messages and then invokes rcu_perf_cleanup() to set state so that a future torture test can run. However, rcu_perf_cleanup() also attempts to end the test that didn't actually start, and in doing so relies on the value of cur_ops, a value that is not particularly relevant in this case. This can result in confusing output or even follow-on failures due to attempts to use facilities that have not been properly initialized. This commit therefore sets the value of cur_ops to NULL in this case and inserts a check near the beginning of rcu_perf_cleanup(), thus avoiding relying on an irrelevant cur_ops value. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuperf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c29761152874..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -494,6 +494,10 @@ rcu_perf_cleanup(void) if (torture_cleanup_begin()) return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) @@ -614,6 +618,7 @@ rcu_perf_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->init) -- cgit v1.2.3 From a9d6938ddb7f892552013b93e4842fc1a538628d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 13:30:01 -0700 Subject: locktorture: NULL cxt.lwsa and cxt.lrsa to allow bad-arg detection Currently, lock_torture_cleanup() uses the values of cxt.lwsa and cxt.lrsa to detect bad parameters that prevented locktorture from initializing, let alone running. In this case, lock_torture_cleanup() does no cleanup aside from invoking torture_cleanup_begin() and torture_cleanup_end(), as required to permit future torture tests to run. However, this heuristic fails if the run with bad parameters was preceded by a previous run that actually ran: In this case, both cxt.lwsa and cxt.lrsa will remain non-zero, which means that the current lock_torture_cleanup() invocation will be unable to detect the fact that it should skip cleanup, which can result in charming outcomes such as double frees. This commit therefore NULLs out both cxt.lwsa and cxt.lrsa at the end of any run that actually ran. Signed-off-by: Paul E. McKenney Cc: Davidlohr Bueso Cc: Josh Triplett --- kernel/locking/locktorture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ad40a2617063..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -829,7 +829,9 @@ static void lock_torture_cleanup(void) "End of test: SUCCESS"); kfree(cxt.lwsa); + cxt.lwsa = NULL; kfree(cxt.lrsa); + cxt.lrsa = NULL; end: torture_cleanup_end(); -- cgit v1.2.3 From 8db5da0b8618df79eceea99672e205d4a2a6309e Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 27 Jan 2019 19:03:45 -0500 Subject: x86/ima: require signed kernel modules Have the IMA architecture specific policy require signed kernel modules on systems with secure boot mode enabled; and coordinate the different signature verification methods, so only one signature is required. Requiring appended kernel module signatures may be configured, enabled on the boot command line, or with this patch enabled in secure boot mode. This patch defines set_module_sig_enforced(). To coordinate between appended kernel module signatures and IMA signatures, only define an IMA MODULE_CHECK policy rule if CONFIG_MODULE_SIG is not enabled. A custom IMA policy may still define and require an IMA signature. Signed-off-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Jessica Yu --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..985caa467aef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -286,6 +286,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? 
*/ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From 7a8e61f8478639072d402a26789055a4a4de8f77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Mar 2019 11:36:19 +0100 Subject: timekeeping: Force upper bound for setting CLOCK_REALTIME Several people reported testing failures after setting CLOCK_REALTIME close to the limits of the kernel internal representation in nanoseconds, i.e. year 2262. The failures are exposed in subsequent operations, i.e. when arming timers or when the advancing CLOCK_MONOTONIC makes the calculation of CLOCK_REALTIME overflow into negative space. Now people start to paper over the underlying problem by clamping calculations to the valid range, but that's just wrong because such workarounds will prevent detection of real issues as well. It is reasonable to force an upper bound for the various methods of setting CLOCK_REALTIME. Year 2262 is the absolute upper bound. Assume a maximum uptime of 30 years which is plenty enough even for esoteric embedded systems. That results in an upper bound of year 2232 for setting the time. Once that limit is reached in reality this limit is only a small part of the problem space. But until then this stops people from trying to paper over the problem at the wrong places. Reported-by: Xiongfeng Wang Reported-by: Hongbo Yao Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: Miroslav Lichvar Cc: Arnd Bergmann Cc: Richard Cochran Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903231125480.2157@nanos.tec.linutronix.de --- kernel/time/time.c | 2 +- kernel/time/timekeeping.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..86656bbac232 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz static int firsttime = 1; int error = 0; - if (tv && !timespec64_valid(tv)) + if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 540145da33da..5716e28bfa3c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts) unsigned long flags; int ret = 0; - if (!timespec64_valid_strict(ts)) + if (!timespec64_valid_settod(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), *ts); if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_strict(&tmp)) { + !timespec64_valid_settod(&tmp)) { ret = -EINVAL; goto error; } @@ -1527,7 +1527,7 @@ void __init timekeeping_init(void) unsigned long flags; read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); - if (timespec64_valid_strict(&wall_time) && + if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { -- cgit v1.2.3 From 1c7651f43777cdd59c1aaa82c87324d3e7438c7b Mon Sep 17 00:00:00 2001 From: Eugene Loh Date: Mon, 25 Feb 2019 11:59:58 -0800 Subject: kallsyms: store type information in its own array When a module is loaded, its symbols' Elf_Sym information is stored in a symtab. 
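The year-2262 and year-2232 figures in the timekeeping change above follow directly from the 2^63-1 nanosecond limit; a quick standalone arithmetic check in plain C, assuming the kernel's KTIME_MAX definition:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int64_t ktime_max_ns = INT64_MAX;                /* KTIME_MAX */
	const int64_t ns_per_year = 31556952LL * 1000000000LL; /* Gregorian */
	long long hard = 1970 + ktime_max_ns / ns_per_year;
	long long settable = 1970 + (ktime_max_ns - 30 * ns_per_year) / ns_per_year;

	printf("representation limit: ~year %lld\n", hard);     /* 2262 */
	printf("settable upper bound: ~year %lld\n", settable); /* 2232 */
	return 0;
}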
Further, type information is also captured. Since Elf_Sym has no type field, historically the st_info field has been hijacked for storing type: st_info was overwritten. commit 5439c985c5a83a8419f762115afdf560ab72a452 ("module: Overwrite st_size instead of st_info") changes that practice, as its one-liner indicates. Unfortunately, this change overwrites symbol size, information that a tool like DTrace expects to find. Allocate a typetab array to store type information so that no Elf_Sym field needs to be overwritten. Fixes: 5439c985c5a8 ("module: Overwrite st_size instead of st_info") Signed-off-by: Eugene Loh Reviewed-by: Nick Alcock [jeyu: renamed typeoff -> typeoffs ] Signed-off-by: Jessica Yu --- kernel/module-internal.h | 2 +- kernel/module.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..d354341f8cc0 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -20,7 +20,7 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long symoffs, stroffs; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; struct _ddebug *debug; unsigned int num_debug; bool sig_ok; diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..69e52e82242a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2647,6 +2647,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ @@ -2660,6 +2662,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_layout.size; mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); mod->init_layout.size = debug_align(mod->init_layout.size); } @@ -2683,20 +2687,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_size - = elf_type(&mod->kallsyms->symtab[i], info); - - /* Now populate the cut down core kallsyms for after init. */ + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. 
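In other words, the fix keeps a typetab array parallel to the symbol table, so neither st_info nor st_size has to be overloaded. A compact sketch of that layout, with illustrative names and userspace ELF types:

#include <elf.h>

/* typetab[i] holds the type character for symtab[i]; nothing in the
 * Elf_Sym entries themselves is repurposed. */
struct mini_kallsyms {
	Elf64_Sym *symtab;
	char *typetab;
	unsigned int num_symtab;
};

static char mini_sym_type(const struct mini_kallsyms *ks, unsigned int i)
{
	return i < ks->num_symtab ? ks->typetab[i] : '?';
}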
+ */ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -4080,7 +4087,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, const Elf_Sym *sym = &kallsyms->symtab[symnum]; *value = kallsyms_symbol_value(sym); - *type = sym->st_size; + *type = kallsyms->typetab[symnum]; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); -- cgit v1.2.3 From aba0954327c831f593702e3a81ef3ad4bec7a838 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Mar 2019 11:28:52 +0100 Subject: tick/broadcast: Fix warning about undefined tick_broadcast_oneshot_offline() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Randconfig builds with CONFIG_TICK_ONESHOT=y CONFIG_HOTPLUG_CPU=n trigger kernel/time/tick-broadcast.c:39:13: warning: ‘tick_broadcast_oneshot_offline’ \ declared ‘static’ but never defined [-Wunused-function] due to that function's definition missing. Move the CONFIG_HOTPLUG_CPU ifdeffery around its declaration too. Fixes: 1b72d4323798 ("tick: Remove outgoing CPU from broadcast masks") Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Reviewed-by: Mukesh Ojha Cc: Valentin Schneider Cc: Frederic Weisbecker Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190329110508.6621-1-bp@alien8.de --- kernel/time/tick-broadcast.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0283523de045..7541cbca695e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,12 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif #endif /* -- cgit v1.2.3 From 2011fccfb61bbd1d7c8864b2b3ed7012342e9ba3 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Mar 2019 18:01:57 -0700 Subject: bpf: Support variable offset stack access from helpers Currently there is a difference in how verifier checks memory access for helper arguments for PTR_TO_MAP_VALUE and PTR_TO_STACK with regard to variable part of offset. 
check_map_access, which is used for PTR_TO_MAP_VALUE, can handle variable offsets just fine, so that a BPF program can call a helper like this: some_helper(map_value_ptr + off, size), where the offset is unknown at load time but is checked by the program to be in a safe range (off >= 0 && off + size < map_value_size). But that is not the case for check_stack_boundary, which is used for PTR_TO_STACK, and the same code with a pointer to stack is rejected by the verifier: some_helper(stack_value_ptr + off, size); For example: 0: (7a) *(u64 *)(r10 -16) = 0 1: (7a) *(u64 *)(r10 -8) = 0 2: (61) r2 = *(u32 *)(r1 +0) 3: (57) r2 &= 4 4: (17) r2 -= 16 5: (0f) r2 += r10 6: (18) r1 = 0xffff888111343a80 8: (85) call bpf_map_lookup_elem#1 invalid variable stack read R2 var_off=(0xfffffffffffffff0; 0x4) Add support for variable offset access to check_stack_boundary so that if the offset is checked by the program to be in a safe range it's accepted by the verifier. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 75 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2fe89138309a..87221fda1321 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2157,6 +2157,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1, true); } +static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, + int off, int access_size, + bool zero_size_allowed) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", + regno, tn_buf, access_size); + } + return -EACCES; + } + return 0; +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. 
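A restricted-C sketch of the access pattern this change legalizes; the map definition, section names, and helper header reflect a typical libbpf setup of this era and are not taken from the patch:

#include <linux/bpf.h>
#include "bpf_helpers.h" /* selftests/libbpf helper macros; path varies */

struct bpf_map_def SEC("maps") vals = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u64),
	.max_entries	= 8,
};

SEC("socket")
int var_stack_key(struct __sk_buff *skb)
{
	__u64 zeros[2] = {0};		/* 16 initialized stack bytes */
	__u32 off = skb->len & 0x4;	/* verifier-visible: 0 or 4 */
	void *key = (char *)zeros + off;/* PTR_TO_STACK with var_off */

	/*
	 * Rejected outright before this patch; accepted afterwards
	 * because both extremes of the offset range are bounds-checked.
	 */
	return bpf_map_lookup_elem(&vals, key) ? 1 : 0;
}

char _license[] SEC("license") = "GPL";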
@@ -2169,7 +2192,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int off, i, slot, spi; + int err, min_off, max_off, i, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2183,21 +2206,23 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", - regno, tn_buf); - return -EACCES; - } - off = reg->off + reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - return -EACCES; + if (tnum_is_const(reg->var_off)) { + min_off = max_off = reg->var_off.value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + } else { + min_off = reg->smin_value + reg->off; + max_off = reg->umax_value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + err = __check_stack_boundary(env, regno, max_off, access_size, + zero_size_allowed); + if (err) + return err; } if (meta && meta->raw_mode) { @@ -2206,10 +2231,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } - for (i = 0; i < access_size; i++) { + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; - slot = -(off + i) - 1; + slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) goto err; @@ -2222,8 +2247,16 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; } err: - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + min_off, i - min_off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", + tn_buf, i - min_off, access_size); + } return -EACCES; mark: /* reading any byte out of 8-byte 'spill_slot' will cause @@ -2232,7 +2265,7 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent); } - return update_stack_depth(env, state, off); + return update_stack_depth(env, state, min_off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, -- cgit v1.2.3 From b5dee3130bb4014511f5d0dd46855ed843e3fdc8 Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:41 +0800 Subject: PM / sleep: Refactor filesystems sync to reduce duplication Create a common helper to sync filesystems for system suspend and hibernation. Signed-off-by: Harry Pan Acked-by: Pavel Machek [ rjw: Changelog ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 5 +---- kernel/power/main.c | 9 +++++++++ kernel/power/suspend.c | 13 +++++-------- kernel/power/user.c | 5 +---- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..cc105ecd9c07 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -709,9 +708,7 @@ int hibernate(void) goto Exit; } - pr_info("Syncing filesystems ... \n"); - ksys_sync(); - pr_info("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..40472a7c5536 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "power.h" @@ -51,6 +52,14 @@ void unlock_system_sleep(void) } EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ + pr_info("Syncing filesystems ... "); + ksys_sync(); + pr_cont("done.\n"); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..e39059dea38b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC - trace_suspend_resume(TPS("sync_filesystems"), 0, true); - pr_info("Syncing filesystems ... "); - ksys_sync(); - pr_cont("done.\n"); - trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif + if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + ksys_sync_helper(); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - printk("Syncing filesystems ... "); - ksys_sync(); - printk("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) -- cgit v1.2.3 From c64546b17bc940643545dd34eac21f51764d633c Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:43 +0800 Subject: PM / sleep: Measure the time of filesystems syncing Measure the filesystems sync time during system sleep more precisely. Among other things, this allows the pr_cont() to be dropped from ksys_sync_helper() and makes automatic system suspend and hibernation profiling somewhat more straightforward. Signed-off-by: Harry Pan [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 40472a7c5536..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -54,9 +54,14 @@ EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { - pr_info("Syncing filesystems ... 
"); + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); ksys_sync(); - pr_cont("done.\n"); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); -- cgit v1.2.3 From de7b77e5bb9451417ca57f1b6501da654587c048 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Mar 2019 07:00:29 -0500 Subject: cpu/hotplug: Create SMT sysfs interface for all arches Make the /sys/devices/system/cpu/smt/* files available on all arches, so user space has a consistent way to detect whether SMT is enabled. The 'control' file now shows 'notimplemented' for architectures which don't yet have CONFIG_HOTPLUG_SMT. [ tglx: Make notimplemented a real state ] Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Waiman Long Cc: Peter Zijlstra Cc: Jiri Kosina Link: https://lkml.kernel.org/r/469c2b98055f2c41e75748e06447d592a64080c9.1553635520.git.jpoimboe@redhat.com --- kernel/cpu.c | 64 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6754f3ecfd94..b8bf3f93e39b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2033,19 +2033,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static const char *smt_states[] = { - [CPU_SMT_ENABLED] = "on", - [CPU_SMT_DISABLED] = "off", - [CPU_SMT_FORCE_DISABLED] = "forceoff", - [CPU_SMT_NOT_SUPPORTED] = "notsupported", -}; - -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); -} - static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); @@ -2116,9 +2103,10 @@ static int cpuhp_smt_enable(void) return ret; } + static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { int ctrlval, ret; @@ -2156,14 +2144,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr, unlock_device_hotplug(); return ret ? 
ret : count; } + +#else /* !CONFIG_HOTPLUG_SMT */ +static ssize_t +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return -ENODEV; +} +#endif /* CONFIG_HOTPLUG_SMT */ + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", + [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + const char *state = smt_states[cpu_smt_control]; + + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return __store_smt_control(dev, attr, buf, count); +} static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); static ssize_t show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) { - bool active = topology_max_smt_threads() > 1; - - return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); + return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); } static DEVICE_ATTR(active, 0444, show_smt_active, NULL); @@ -2179,21 +2197,17 @@ static const struct attribute_group cpuhp_smt_attr_group = { NULL }; -static int __init cpu_smt_state_init(void) +static int __init cpu_smt_sysfs_init(void) { return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -#else -static inline int cpu_smt_state_init(void) { return 0; } -#endif - static int __init cpuhp_sysfs_init(void) { int cpu, ret; - ret = cpu_smt_state_init(); + ret = cpu_smt_sysfs_init(); if (ret) return ret; @@ -2214,7 +2228,7 @@ static int __init cpuhp_sysfs_init(void) return 0; } device_initcall(cpuhp_sysfs_init); -#endif +#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that -- cgit v1.2.3 From 40ed29b373381532ef222e509c5aa69a1d8561ea Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 23 Sep 2018 12:11:33 +0000 Subject: ring-buffer: Fix ring buffer size in rb_write_something() 'cnt' should be used to calculate ring buffer size rather than data->cnt Link: http://lkml.kernel.org/r/1537704693-184237-1-git-send-email-yuehaibing@huawei.com Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4f33d7d841af 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ - size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); -- cgit v1.2.3 From f45d1225adb0479478cee989e2ae2d7d2c62b31b Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Mar 2019 11:28:51 -0700 Subject: tracing: Kernel access to Ftrace instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ftrace provides the feature “instances” that provides the capability to create multiple Ftrace ring buffers. However, currently these buffers are created/accessed via userspace only. 
The kernel APIs providing these features are not exported, hence cannot be used by other kernel components. This patch aims to extend this infrastructure to provide the flexibility to create/log/remove/ enable-disable existing trace events to these buffers from within the kernel. Link: http://lkml.kernel.org/r/1553106531-3281-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Joe Jin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 74 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace_events.c | 1 + 2 files changed, 51 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..4384fcc386c8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3053,6 +3053,7 @@ void trace_printk_init_buffers(void) if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { @@ -3213,6 +3214,7 @@ int trace_array_printk(struct trace_array *tr, va_end(ap); return ret; } +EXPORT_SYMBOL_GPL(trace_array_printk); __printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, @@ -8037,7 +8039,7 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -static int instance_mkdir(const char *name) +struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; @@ -8101,7 +8103,7 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return tr; out_free_tr: free_trace_buffers(tr); @@ -8113,33 +8115,21 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return ret; + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(trace_array_create); +static int instance_mkdir(const char *name) +{ + return PTR_ERR_OR_ZERO(trace_array_create(name)); } -static int instance_rmdir(const char *name) +static int __remove_instance(struct trace_array *tr) { - struct trace_array *tr; - int found = 0; - int ret; int i; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -ENODEV; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) { - found = 1; - break; - } - } - if (!found) - goto out_unlock; - - ret = -EBUSY; if (tr->ref || (tr->current_trace && tr->current_trace->ref)) - goto out_unlock; + return -EBUSY; list_del(&tr->list); @@ -8165,10 +8155,46 @@ static int instance_rmdir(const char *name) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); + tr = NULL; - ret = 0; + return 0; +} + +int trace_array_destroy(struct trace_array *tr) +{ + int ret; + + if (!tr) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = __remove_instance(tr); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + ret = __remove_instance(tr); + break; + } + } - out_unlock: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..81c038ed6cee 100644 --- a/kernel/trace/trace_events.c +++ 
b/kernel/trace/trace_events.c @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } +EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event -- cgit v1.2.3 From 8a062902be725f647dc8da532b04d836546a369a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:15 -0500 Subject: tracing: Add tracing error log Introduce a new ftrace file, tracing/error_log, for ftrace commands to log errors. This is useful for allowing more complex commands such as hist trigger and kprobe_event commands to point out specifically where something may have gone wrong without forcing them to resort to more ad hoc methods such as tacking error messages onto existing output files. To log a tracing error, call the tracing_log_err() function, passing it a location string describing where it came from e.g. kprobe_events or system:event, the command that caused the error, an array of static error strings describing errors and an index within that array which describes the specific error, along with the position to place the error caret. Reading the log displays the last (currently) 8 errors logged in the following format: [timestamp] <loc>: error: <static error text> Command: <command> ^ Memory for the error log isn't allocated unless there has been a trace event error, and the error log can be cleared and have its memory freed by writing the empty string in truncation mode to it: # echo > tracing/error_log. Link: http://lkml.kernel.org/r/0c2c82571fd38c5f3a88ca823627edff250e9416.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Suggested-by: Masami Hiramatsu Improvements-suggested-by: Steve Rostedt Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 4 + 2 files changed, 222 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4384fcc386c8..7978168f5041 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6878,6 +6878,221 @@ static const struct file_operations snapshot_raw_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ +#define TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ +}; + +static LIST_HEAD(tracing_err_log); +static DEFINE_MUTEX(tracing_err_log_lock); + +static unsigned int n_tracing_err_log_entries; + +struct tracing_log_err *get_tracing_log_err(void) +{ + struct tracing_log_err *err; + + if (n_tracing_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + err = ERR_PTR(-ENOMEM); + n_tracing_err_log_entries++; + + return err; + } + + err = list_first_entry(&tracing_err_log, struct tracing_log_err, list); + list_del(&err->list); + + return err; + } + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurrence of @str within @cmd.
The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * : error: + * Command: + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). + */ +void tracing_log_err(const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos) +{ + struct tracing_log_err *err; + + mutex_lock(&tracing_err_log_lock); + err = get_tracing_log_err(); + if (PTR_ERR(err) == -ENOMEM) { + mutex_unlock(&tracing_err_log_lock); + return; + } + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tracing_err_log); + mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(void) +{ + struct tracing_log_err *err, *next; + + mutex_lock(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tracing_err_log, list) { + list_del(&err->list); + kfree(err); + } + + n_tracing_err_log_entries = 0; + mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tracing_err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tracing_err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +{ + u8 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct 
seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &tracing_err_log_seq_ops); + + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = seq_lseek, +}; + static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -8284,6 +8499,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &snapshot_fops); #endif + trace_create_file("error_log", 0644, d_tracer, + tr, &tracing_err_log_fops); + for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..b711edbef7e7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1884,6 +1884,10 @@ extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char**)); +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats -- cgit v1.2.3 From a1a05bb40e229d148c071fcd2ed787b21f61ff8b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:16 -0500 Subject: tracing: Save the last hist command's associated event name In preparation for making use of the new trace error log, save the subsystem and event name associated with the last hist command - it will be passed as the location param in the tracing_log_err() calls.
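For a sense of what gets saved (the trigger string and event are hypothetical), a failing command written to events/sched/sched_switch/trigger would be recorded roughly as:

	last_cmd     = "hist:keys=bogus"
	last_cmd_loc = "hist:sched:sched_switch"

The location string is assembled with snprintf(..., "hist:%s:%s", system, name) as in the diff below, and later becomes the location parameter of tracing_log_err().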
Link: http://lkml.kernel.org/r/eb0fd1362be8f39facb86c83eecf441b7a5876f8.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 795aa2038377..0de702bf148f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -535,15 +535,34 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_hist_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; + static char hist_err_str[MAX_FILTER_STR_VAL]; -static void last_cmd_set(char *str) +static void last_cmd_set(struct trace_event_file *file, char *str) { + const char *system = NULL, *name = NULL; + struct trace_event_call *call; + if (!str) return; - strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + + if (file) { + call = file->event_call; + + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } + } + + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } static void hist_err(char *str, char *var) @@ -583,6 +602,8 @@ static void hist_err_event(char *str, char *system, char *event, char *var) static void hist_err_clear(void) { hist_err_str[0] = '\0'; + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } static bool have_hist_err(void) @@ -5438,8 +5459,8 @@ static int hist_show(struct seq_file *m, void *v) } if (have_hist_err()) { - seq_printf(m, "\nERROR: %s\n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_hist_cmd); + seq_printf(m, "\n%s: error: \n", hist_err_str); + seq_printf(m, " Last command: %s\n", last_cmd); } out_unlock: @@ -6043,8 +6064,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); if (glob && strlen(glob)) { - last_cmd_set(param); hist_err_clear(); + last_cmd_set(file, param); } if (!param) -- cgit v1.2.3 From d566c5e9d1bad6773fe9cce3d4514cca2cc32e4e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:17 -0500 Subject: tracing: Use tracing error_log with hist triggers Replace hist_err() and hist_err_event() with tracing_log_err() from the new tracing error_log mechanism. Also add a couple related helper functions and remove most of the old hist_err()-related code. With this change, users no longer read the hist files for hist trigger error information, but instead look at tracing/error_log for the same information. 
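A usage sketch (the trigger and the exact output are illustrative; the error text is one of the static strings from the ERRORS table added below):

	# cd /sys/kernel/debug/tracing
	# echo 'hist:keys=bogus' > events/sched/sched_switch/trigger
	-su: echo: write error: Invalid argument
	# cat error_log
	[  122.592056] hist:sched:sched_switch: error: Couldn't find field
	  Command: hist:keys=bogus
	                     ^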
Link: http://lkml.kernel.org/r/c98f77a97c9715d18b623eeb5741057b330d5ac0.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 206 ++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0de702bf148f..071c62cacba7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -22,6 +22,57 @@ #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define ERRORS \ + C(NONE, "No error"), \ + C(DUPLICATE_VAR, "Variable already defined"), \ + C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ + C(TOO_MANY_VARS, "Too many variables defined"), \ + C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ + C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ + C(TRIGGER_EEXIST, "Hist trigger already exists"), \ + C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ + C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ + C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ + C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ + C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ + C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ + C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ + C(HIST_NOT_FOUND, "Matching event histogram not found"), \ + C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ + C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ + C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ + C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ + C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ + C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ + C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ + C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ + C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ + C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ + C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ + C(TOO_MANY_PARAMS, "Too many action params"), \ + C(PARAM_NOT_FOUND, "Couldn't find param"), \ + C(INVALID_PARAM, "Invalid action param"), \ + C(ACTION_NOT_FOUND, "No action found"), \ + C(NO_SAVE_PARAMS, "No params found for save()"), \ + C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ + C(ACTION_MISMATCH, "Handler doesn't support action"), \ + C(NO_CLOSING_PAREN, "No closing paren found"), \ + C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ + C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ + C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(VAR_NOT_FOUND, "Couldn't find variable"), \ + C(FIELD_NOT_FOUND, "Couldn't find field"), + +#undef C +#define C(a, b) HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -538,7 +589,10 @@ static struct track_data *track_data_alloc(unsigned int key_len, static char last_cmd[MAX_FILTER_STR_VAL]; static char last_cmd_loc[MAX_FILTER_STR_VAL]; -static char hist_err_str[MAX_FILTER_STR_VAL]; 
+static int errpos(char *str) +{ + return err_pos(last_cmd, str); +} static void last_cmd_set(struct trace_event_file *file, char *str) { @@ -565,55 +619,17 @@ static void last_cmd_set(struct trace_event_file *file, char *str) snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err(char *str, char *var) +static void hist_err(u8 err_type, u8 err_pos) { - int maxlen = MAX_FILTER_STR_VAL - 1; - - if (!str) - return; - - if (strlen(hist_err_str)) - return; - - if (!var) - var = ""; - - if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) - return; - - strcat(hist_err_str, str); - strcat(hist_err_str, var); -} - -static void hist_err_event(char *str, char *system, char *event, char *var) -{ - char err[MAX_FILTER_STR_VAL]; - - if (system && var) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); - else if (system) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); - else - strscpy(err, var, MAX_FILTER_STR_VAL); - - hist_err(str, err); + tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); } static void hist_err_clear(void) { - hist_err_str[0] = '\0'; last_cmd[0] = '\0'; last_cmd_loc[0] = '\0'; } -static bool have_hist_err(void) -{ - if (strlen(hist_err_str)) - return true; - - return false; -} - struct synth_trace_event { struct trace_entry ent; u64 fields[]; @@ -1740,7 +1756,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1791,7 +1807,7 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2023,7 +2039,6 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) attrs->n_actions++; ret = 0; } - return ret; } @@ -2083,7 +2098,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", str); + hist_err(HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2681,8 +2696,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err_event("Couldn't find variable: $", - system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2716,7 +2730,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err("Invalid field modifier: ", modifier); + hist_err(HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2732,7 +2746,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err("Couldn't find field: ", field_name); + hist_err(HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2843,7 +2857,7 @@ 
static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2926,7 +2940,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err("Timestamp units in expression don't match", NULL); + hist_err(HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2944,7 +2958,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -3182,16 +3196,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("trace action: Too many field variables defined: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("trace action: Event file not found: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3204,8 +3216,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("trace action: Matching event histogram not found: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3266,8 +3277,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't create histogram for field: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3279,8 +3289,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't find synthetic variable: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3417,21 +3426,21 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err("Too many field variables defined: ", field_name); + hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err("Couldn't parse field variable: ", field_name); + hist_err(HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err("Couldn't create or find variable: ", field_name); + hist_err(HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3763,14 +3772,14 @@ static int track_data_create(struct hist_trigger_data *hist_data, track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err("For 
onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); + hist_err(HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); + hist_err(HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3783,7 +3792,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onmax variable: ", "__max"); + hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3791,7 +3800,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onchange variable: ", "__change"); + hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3810,20 +3819,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err("Too many action params", ""); + hist_err(HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - hist_err("No action param found", ""); + hist_err(HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err("Invalid action param: ", param); + hist_err(HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3855,14 +3864,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err("action parsing: No action found", ""); + hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err("action parsing: No action found", ""); + hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3871,7 +3880,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err("action parsing: No params found for %s", "save"); + hist_err(HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } @@ -3885,7 +3894,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3897,7 +3906,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err("action parsing: No closing paren found: %s", params); + hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3907,7 +3916,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -4060,7 +4069,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err_event("trace action: Couldn't find param: $", system, event, 
var); + hist_err(HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4135,7 +4144,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); + hist_err(HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4196,15 +4205,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("trace action: Param type doesn't match synthetic event field type: ", - system, event_name, param); + hist_err(HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); + hist_err(HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4250,7 +4258,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err("save action: Can't have more than one save() action per hist", ""); + hist_err(HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4263,7 +4271,7 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err("save action: Couldn't create field variable: ", param); + hist_err(HIST_ERR_FIELD_VAR_CREATE_FAIL, errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4297,19 +4305,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err("onmatch: Missing closing paren: ", match_event); + hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + hist_err(HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err_event("onmatch: Invalid subsystem or event name: ", - match_event_system, match_event, NULL); + hist_err(HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4400,7 +4407,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err("Variable already defined: ", var_name); + hist_err(HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4481,7 +4488,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err("Using variable references as keys not supported: ", field_str); + hist_err(HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4595,13 +4602,13 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err("Malformed assignment: ", var_name); + hist_err(HIST_ERR_MALFORMED_ASSIGNMENT, errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", var_name); + hist_err(HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5458,11 +5465,6 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } 
- if (have_hist_err()) { - seq_printf(m, "\n%s: error: \n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_cmd); - } - out_unlock: mutex_unlock(&event_mutex); @@ -5834,7 +5836,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); + hist_err(HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5855,7 +5857,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err("Hist trigger already exists", NULL); + hist_err(HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5863,7 +5865,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err("Can't clear or continue a nonexistent hist trigger", NULL); + hist_err(HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5888,7 +5890,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err("Couldn't set trace_clock: ", clock); + hist_err(HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } -- cgit v1.2.3 From 34f76afaac7a437a2ce381225135563928b359dd Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:18 -0500 Subject: tracing: Use tracing error_log with trace event filters Use tracing_log_err() from the new tracing error_log mechanism to send filter parse errors to tracing/error_log. With this change, users will be able to see filter errors by looking at tracing/error_log. The same errors will also be available in the filter file, as expected. Link: http://lkml.kernel.org/r/1d942c419941539a11d78a6810fc5740a99b2974.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 05a66493a164..290d42c59101 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -66,7 +66,8 @@ static const char * ops[] = { OPS }; C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ - C(NO_FILTER, "No filter found"), + C(ERRNO, "Error"), \ + C(NO_FILTER, "No filter found") #undef C #define C(a, b) FILT_ERR_##a @@ -76,7 +77,7 @@ enum { ERRORS }; #undef C #define C(a, b) b -static char *err_text[] = { ERRORS }; +static const char *err_text[] = { ERRORS }; /* Called after a '!' 
character but "!=" and "!~" are not "not"s */ static bool is_not(const char *str) @@ -947,8 +948,14 @@ static void append_filter_err(struct filter_parse_error *pe, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + tracing_log_err("event filter parse error", + filter->filter_string, err_text, + pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + tracing_log_err("event filter parse error", + filter->filter_string, err_text, + FILT_ERR_ERRNO, 0); } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); -- cgit v1.2.3 From ab105a4fb89496c71c5a0f3222347c506c30feb0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 31 Mar 2019 18:48:19 -0500 Subject: tracing: Use tracing error_log with probe events Use tracing error_log with probe events for logging error more precisely. This also makes all parse error returns -EINVAL (except for -ENOMEM), because user can see better error message in error_log file now. Link: http://lkml.kernel.org/r/6a4d90e141d138040ea61f4776b991597077451e.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 77 ++++++++----- kernel/trace/trace_probe.c | 274 +++++++++++++++++++++++++++++++------------- kernel/trace/trace_probe.h | 77 ++++++++++++- kernel/trace/trace_uprobe.c | 44 ++++--- 4 files changed, 348 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5d5129b05df7..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) { + if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } return ret; } @@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk; + struct trace_kprobe *tk = NULL; int i, len, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; @@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; + trace_probe_log_init("trace_kprobe", argc, argv); + event = strchr(&argv[0][1], ':'); if (event) event++; if (isdigit(argv[0][1])) { if (!is_return) { - pr_info("Maxactive is not for kprobe"); - return -EINVAL; + trace_probe_log_err(1, MAXACT_NO_KPROBE); + goto parse_error; } if (event) len = event - &argv[0][1] - 1; else len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) - return -E2BIG; + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { - pr_info("Invalid maxactive number\n"); - return ret; + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. 
*/ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { - pr_info("Maxactive is too big (%d > %d).\n", - maxactive, KRETPROBE_MAXACTIVE_MAX); - return -E2BIG; + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; } } /* try to parse an address. if that fails, try to read the * input as a symbol. */ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) - return -ECANCELED; + if (strchr(argv[1], '/') && strchr(argv[1], ':')) { + ret = -ECANCELED; + goto error; + } /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { - pr_info("Failed to parse either an address or a symbol.\n"); - goto out; + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { - pr_info("Given offset is not valid for return probe.\n"); - ret = -EINVAL; - goto out; + trace_probe_log_err(0, BAD_RETPROBE); + goto parse_error; } } - argc -= 2; argv += 2; + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) - goto out; + goto parse_error; } else { /* Make a new event name */ if (symbol) @@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); - /* This must return -ENOMEM otherwise there is a bug */ + /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; + goto out; /* We know tk is not allocated */ } + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { @@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); kfree(tmp); if (ret) - goto error; + goto error; /* This can be -ENOMEM */ } ret = register_trace_kprobe(tk); - if (ret) + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM) + trace_probe_log_err(0, FAIL_REG_PROBE); goto error; + } + out: + trace_probe_log_clear(); kfree(symbol); return ret; +parse_error: + ret = -EINVAL; error: free_trace_kprobe(tk); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f8411e7835f..e11f98c49d72 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,6 +13,11 @@ #include "trace_probe.h" +#undef C +#define C(a, b) b + +static const char *trace_probe_err_text[] = { ERRORS }; + static const char *reserved_field_names[] = { "common_type", "common_flags", @@ -133,6 +138,60 @@ fail: return NULL; } +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ + trace_probe_log.subsystem = subsystem; + trace_probe_log.argc = argc; + trace_probe_log.argv = argv; + 
trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ + trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ + char *command, *p; + int i, len = 0, pos = 0; + + if (!trace_probe_log.argv) + return; + + /* Recalculate the length and allocate buffer */ + for (i = 0; i < trace_probe_log.argc; i++) { + if (i == trace_probe_log.index) + pos = len; + len += strlen(trace_probe_log.argv[i]) + 1; + } + command = kzalloc(len, GFP_KERNEL); + if (!command) + return; + + /* And make a command string from argv array */ + p = command; + for (i = 0; i < trace_probe_log.argc; i++) { + len = strlen(trace_probe_log.argv[i]); + strcpy(p, trace_probe_log.argv[i]); + p[len] = ' '; + p += len + 1; + } + *(p - 1) = '\0'; + + tracing_log_err(trace_probe_log.subsystem, command, + trace_probe_err_text, err_type, pos + offset); + + kfree(command); +} + /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) /* @buf must have MAX_EVENT_NAME_LEN size */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, - char *buf) + char *buf, int offset) { const char *slash, *event = *pevent; int len; @@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, slash = strchr(event, '/'); if (slash) { if (slash == event) { - pr_info("Group name is not specified\n"); + trace_probe_log_err(offset, NO_GROUP_NAME); return -EINVAL; } if (slash - event + 1 > MAX_EVENT_NAME_LEN) { - pr_info("Group name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, GROUP_TOO_LONG); + return -EINVAL; } strlcpy(buf, event, slash - event + 1); if (!is_good_name(buf)) { - pr_info("Group name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_GROUP_NAME); + return -EINVAL; } *pgroup = buf; *pevent = slash + 1; + offset += slash - event + 1; event = *pevent; } len = strlen(event); if (len == 0) { - pr_info("Event name is not specified\n"); + trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { - pr_info("Event name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, EVENT_TOO_LONG); + return -EINVAL; } if (!is_good_name(event)) { - pr_info("Event name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_EVENT_NAME); return -EINVAL; } return 0; @@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags) + struct fetch_insn *code, unsigned int flags, int offs) { unsigned long param; int ret = 0; int len; if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) + if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - else + } else { + trace_probe_log_err(offs, RETVAL_ON_PROBE); ret = -EINVAL; + } } else if ((len = str_has_prefix(arg, "stack"))) { if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, &param); - if (ret || ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK)) + if (ret) { + goto inval_var; + } else if ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_STACK_NUM);
ret = -EINVAL; - else { + } else { code->op = FETCH_OP_STACK; code->param = (unsigned int)param; } } else - ret = -EINVAL; + goto inval_var; } else if (strcmp(arg, "comm") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && (len = str_has_prefix(arg, "arg"))) { - if (!isdigit(arg[len])) - return -EINVAL; ret = kstrtoul(arg + len, 10, &param); - if (ret || !param || param > PARAM_MAX_STACK) + if (ret) { + goto inval_var; + } else if (!param || param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_ARG_NUM); return -EINVAL; + } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif } else - ret = -EINVAL; + goto inval_var; return ret; + +inval_var: + trace_probe_log_err(offs, BAD_VAR); + return -EINVAL; } /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags) + unsigned int flags, int offs) { struct fetch_insn *code = *pcode; unsigned long param; @@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags); + ret = parse_probe_vars(arg + 1, type, code, flags, offs); break; case '%': /* named register */ @@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_REG; code->param = (unsigned int)ret; ret = 0; - } + } else + trace_probe_log_err(offs, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, &param); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_MEM_ADDR); break; + } /* load address */ code->op = FETCH_OP_IMM; code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) + if (flags & TPARG_FL_KERNEL) { + trace_probe_log_err(offs, FILE_ON_KPROBE); return -EINVAL; - + } ret = kstrtol(arg + 2, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_FILE_OFFS); break; + } code->op = FETCH_OP_FOFFS; code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) + if (!(flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(offs, SYM_ON_UPROBE); return -EINVAL; - + } /* Preserve symbol for updating */ code->op = FETCH_NOP_SYMBOL; code->data = kstrdup(arg + 1, GFP_KERNEL); if (!code->data) return -ENOMEM; - if (++code == end) - return -E2BIG; - + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } code->op = FETCH_OP_IMM; code->immediate = 0; } /* These are fetching from memory */ - if (++code == end) - return -E2BIG; + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } *pcode = code; code->op = FETCH_OP_DEREF; code->offset = offset; @@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, /* fall through */ case '-': tmp = strchr(arg, '('); - if (!tmp) + if (!tmp) { + trace_probe_log_err(offs, DEREF_NEED_BRACE); return -EINVAL; - + } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_DEREF_OFFS); break; - + } + offs += (tmp + 1 - arg) + (arg[0] != '-' ?
1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); - - if (tmp) { + if (!tmp) { + trace_probe_log_err(offs + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } else { const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags); + ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) + if (code->op == FETCH_OP_COMM) { + trace_probe_log_err(offs, COMM_CANT_DEREF); + return -EINVAL; + } + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); return -EINVAL; - if (++code == end) - return -E2BIG; + } *pcode = code; code->op = FETCH_OP_DEREF; @@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ + trace_probe_log_err(offs, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf, return -EINVAL; code++; if (code->op != FETCH_OP_NOP) - return -E2BIG; + return -EINVAL; *pcode = code; code->op = FETCH_OP_MOD_BF; @@ -392,32 +484,53 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags) + struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2; + char *t, *t2, *t3; int ret, len; - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; + len = strlen(arg); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(offset, ARG_TOO_LONG); + return -EINVAL; + } else if (len == 0) { + trace_probe_log_err(offset, NO_ARG_BODY); + return -EINVAL; } + parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); + if (!parg->comm) return -ENOMEM; - } + t = strchr(arg, ':'); if (t) { *t = '\0'; t2 = strchr(++t, '['); if (t2) { - *t2 = '\0'; - parg->count = simple_strtoul(t2 + 1, &t2, 0); - if (strcmp(t2, "]") || parg->count == 0) + *t2++ = '\0'; + t3 = strchr(t2, ']'); + if (!t3) { + offset += t2 + strlen(t2) - arg; + trace_probe_log_err(offset, + ARRAY_NO_CLOSE); + return -EINVAL; + } else if (t3[1] != '\0') { + trace_probe_log_err(offset + t3 + 1 - arg, + BAD_ARRAY_SUFFIX); + return -EINVAL; + } + *t3 = '\0'; + if (kstrtouint(t2, 0, &parg->count) || !parg->count) { + trace_probe_log_err(offset + t2 - arg, + BAD_ARRAY_NUM); return -EINVAL; - if (parg->count > MAX_ARRAY_LEN) - return -E2BIG; + } + if (parg->count > MAX_ARRAY_LEN) { + trace_probe_log_err(offset + t2 - arg, + ARRAY_TOO_BIG); + return -EINVAL; + } } } /* @@ -429,7 +542,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, else parg->type = find_fetch_type(t); if (!parg->type) { - pr_info("Unsupported type: %s\n", t); + trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_TYPE); return -EINVAL; } parg->offset = *size; @@ -450,7 +563,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags); + flags, offset); if (ret) goto fail; @@ -458,7 +571,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { - pr_info("string only accepts memory or address.\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } @@ -470,7 +584,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } } @@ -483,7 +598,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -493,20 +609,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Modify operation */ if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); - if (ret) + if (ret) { + trace_probe_log_err(offset + t - arg, BAD_BITFIELD); goto fail; + } } /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING) { - pr_info("array only accepts memory or address\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -555,15 +675,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, { struct probe_arg *parg = &tp->args[i]; char *body; - int ret; /* Increment count for freeing args in error case */ tp->nr_args++; body = strchr(arg, '='); if (body) { - if (body - arg > MAX_ARG_NAME_LEN || body == arg) + if (body - arg > MAX_ARG_NAME_LEN) { + trace_probe_log_err(0, ARG_NAME_TOO_LONG); + return -EINVAL; + } else if (body == arg) { + trace_probe_log_err(0, NO_ARG_NAME); return -EINVAL; + } parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { @@ -575,22 +699,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, return -ENOMEM; if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); + trace_probe_log_err(0, BAD_ARG_NAME); return -EINVAL; } - if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { - pr_info("Argument[%d]: '%s' conflicts with another field.\n", - i, parg->name); + trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); - if (ret) - pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); - return ret; + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, + body - arg); } void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2177c206de15..b7737666c1a8 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -280,8 +280,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int traceprobe_parse_event_name(const char **pevent, - const char **pgroup, char *buf); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); @@ -298,3 +298,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS \ + C(FILE_NOT_FOUND, "Failed to find the given file"), \ + C(NO_REGULAR_FILE, "Not a regular file"), \ + C(BAD_REFCNT, "Invalid reference counter offset"), \ + C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ + C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ + C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ + C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT, "Invalid maxactive number"), \ + C(MAXACT_TOO_BIG, "Maxactive is too big"), \ + C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_GROUP_NAME, "Group name is not specified"), \ + C(GROUP_TOO_LONG, "Group name is too long"), \ + C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ + C(NO_EVENT_NAME, "Event name is not specified"), \ + C(EVENT_TOO_LONG, "Event name is too long"), \ + C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(BAD_STACK_NUM, "Invalid stack number"), \ + C(BAD_ARG_NUM, "Invalid argument number"), \ + C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_REG_NAME, "Invalid register name"), \ + C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ + C(BAD_FILE_OFFS, "Invalid file offset value"), \ + C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ + C(TOO_MANY_OPS, "Dereference is too much nested"), \ + C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ + C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ + C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ + C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ + C(BAD_FETCH_ARG, "Invalid fetch argument"), \ + C(ARRAY_NO_CLOSE, "Array is not closed"), \ + C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ + C(BAD_ARRAY_NUM, "Invalid array size"), \ + C(ARRAY_TOO_BIG, "Array number is too big"), \ + C(BAD_TYPE, "Unknown type is specified"), \ + C(BAD_STRING, "String accepts only memory argument"), \ + C(BAD_BITFIELD, "Invalid bitfield"), \ + C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ + C(NO_ARG_NAME, "Argument name is not specified"), \ + C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ + C(USED_ARG_NAME, "This argument name is already used"), \ + C(ARG_TOO_LONG, "Argument expression is too long"), \ + C(NO_ARG_BODY, "No argument 
expression"), \ + C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ + C(FAIL_REG_PROBE, "Failed to register probe event"), + +#undef C +#define C(a, b) TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { + const char *subsystem; + const char **argv; + int argc; + int index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err) \ + __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index be78d99ee6bc..cd8750a72768 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -457,13 +457,19 @@ static int trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } + trace_probe_log_init("trace_uprobe", argc, argv); + trace_probe_log_set_index(1); /* filename is the 2nd argument */ + *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { + trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); + trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { + trace_probe_log_err(0, NO_REGULAR_FILE); ret = -EINVAL; goto fail_address_parse; } @@ -472,9 +478,16 @@ static int trace_uprobe_create(int argc, const char **argv) rctr = strchr(arg, '('); if (rctr) { rctr_end = strchr(rctr, ')'); - if (rctr > rctr_end || *(rctr_end + 1) != 0) { + if (!rctr_end) { + ret = -EINVAL; + rctr_end = rctr + strlen(rctr); + trace_probe_log_err(rctr_end - filename, + REFCNT_OPEN_BRACE); + goto fail_address_parse; + } else if (rctr_end[1] != '\0') { ret = -EINVAL; - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr_end + 1 - filename, + BAD_REFCNT_SUFFIX); goto fail_address_parse; } @@ -482,22 +495,23 @@ static int trace_uprobe_create(int argc, const char **argv) *rctr_end = '\0'; ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr - filename, BAD_REFCNT); goto fail_address_parse; } } /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); goto fail_address_parse; - - argc -= 2; - argv += 2; + } /* setup a probe */ + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) goto fail_address_parse; } else { @@ -519,6 +533,9 @@ static int trace_uprobe_create(int argc, const char **argv) kfree(tail); } + argc -= 2; + argv += 2; + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { ret = PTR_ERR(tu); @@ -539,6 +556,7 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? 
TPARG_FL_RETURN : 0); kfree(tmp); @@ -547,20 +565,20 @@ static int trace_uprobe_create(int argc, const char **argv) } ret = register_trace_uprobe(tu); - if (ret) - goto error; - return 0; + if (!ret) + goto out; error: free_trace_uprobe(tu); +out: + trace_probe_log_clear(); return ret; fail_address_parse: + trace_probe_log_clear(); path_put(&path); kfree(filename); - pr_info("Failed to parse address or file.\n"); - return ret; } -- cgit v1.2.3 From d18bf4229b1772e91c0c36772737c01cf9726720 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 12 Mar 2019 04:06:37 -0400 Subject: perf/core: Make perf_swevent_init_cpu() static 'make W=1' causes GCC to complain: kernel/events/core.c:11877:6: warning: no previous prototype for 'perf_swevent_init_cpu' [-Wmissing-prototypes] It's not referenced anywhere else, make it static. Signed-off-by: Valdis Kletnieks Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/28974.1552377997@turing-police Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 72d06e302e99..dfc4bab0b02b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11878,7 +11878,7 @@ static void __init perf_event_init_all_cpus(void) } } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); -- cgit v1.2.3 From 6455959819bf2469190ae9f6b4ccebaa9827e884 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Feb 2019 14:38:37 +0100 Subject: ia64/tlb: Eradicate tlb_migrate_finish() callback Only ia64-sn2 uses this as an optimization, and there it is of questionable correctness due to the mm_users==1 test. Remove it entirely. No change in behavior intended. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4778c48a7fda..ade3f2287d1f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1151,7 +1151,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); return 0; } else if (task_on_rq_queued(p)) { /* -- cgit v1.2.3 From 4a6c91fbdef846ec7250b82f2eeeb87ac5f18cf9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Mar 2019 11:09:13 +0100 Subject: x86/uaccess, ftrace: Fix ftrace_likely_update() vs. SMAP For CONFIG_TRACE_BRANCH_PROFILING=y the likely/unlikely things get overloaded and generate callouts to this code, and thus also when AC=1. Make it safe. 
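For reference, the pattern applied below is the usual one for code that can be entered with AC set: save and disable the user-access (SMAP) state around the instrumentation body, then restore it afterwards. A minimal sketch of the pattern, using the same x86 uaccess helpers the patch uses:

	unsigned long flags = user_access_save();	/* saves EFLAGS.AC and clears it */

	/* ... instrumentation work that must not run with user access enabled ... */

	user_access_restore(flags);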
Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant) { + unsigned long flags = user_access_save(); + /* A constant is always correct */ if (is_constant) { f->constant++; @@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, f->data.correct++; else f->data.incorrect++; + + user_access_restore(flags); } EXPORT_SYMBOL(ftrace_likely_update); -- cgit v1.2.3 From 40ea97290b08be2e038b31cbb33097d1145e8169 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Mar 2019 19:54:25 +0100 Subject: x86/uaccess, kcov: Disable stack protector New tooling noticed this mishap: kernel/kcov.o: warning: objtool: write_comp_data()+0x138: call to __stack_chk_fail() with UACCESS enabled kernel/kcov.o: warning: objtool: __sanitizer_cov_trace_pc()+0xd9: call to __stack_chk_fail() with UACCESS enabled All the other instrumentation (KASAN,UBSAN) also have stack protector disabled. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..62471e75a2b0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) -- cgit v1.2.3 From a1247d06d01045d7ab2882a9c074fbf21137c690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Mar 2019 13:18:56 +0100 Subject: locking/static_key: Fix false positive warnings on concurrent dec/inc Even though the atomic_dec_and_mutex_lock() in __static_key_slow_dec_cpuslocked() can never see a negative value in key->enabled the subsequent sanity check is re-reading key->enabled, which may have been set to -1 in the meantime by static_key_slow_inc_cpuslocked(). CPU A CPU B __static_key_slow_dec_cpuslocked(): static_key_slow_inc_cpuslocked(): # enabled = 1 atomic_dec_and_mutex_lock() # enabled = 0 atomic_read() == 0 atomic_set(-1) # enabled = -1 val = atomic_read() # Oops - val == -1! The test case is TCP's clean_acked_data_enable() / clean_acked_data_disable() as tickled by KTLS (net/ktls). Suggested-by: Jakub Kicinski Reported-by: Jakub Kicinski Tested-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: pbonzini@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..a799b1ac6b2f 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -206,6 +206,8 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + int val; + lockdep_assert_cpus_held(); /* @@ -215,17 +217,20 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. */ - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { - WARN(atomic_read(&key->enabled) < 0, - "jump label: negative count!\n"); + val = atomic_fetch_add_unless(&key->enabled, -1, 1); + if (val != 1) { + WARN(val < 0, "jump label: negative count!\n"); return; } - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else { - jump_label_update(key); + jump_label_lock(); + if (atomic_dec_and_test(&key->enabled)) { + if (rate_limit) { + atomic_inc(&key->enabled); + schedule_delayed_work(work, rate_limit); + } else { + jump_label_update(key); + } } jump_label_unlock(); } -- cgit v1.2.3 From b10abd0a8859493a93c6b8020f2be2587557749d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:23 -0400 Subject: sched/cpufreq: Annotate cpufreq_update_util_data pointer with __rcu Recently I added an RCU annotation check to rcu_assign_pointer(). All pointers assigned to RCU protected data are to be annotated with __rcu in order to be able to use rcu_assign_pointer() similar to checks in other RCU APIs. This resulted in a sparse error: kernel//sched/cpufreq.c:41:9: sparse: error: incompatible types in comparison expression (different address spaces) Fix this by annotating cpufreq_update_util_data pointer with __rcu. This will also help sparse catch any future RCU misuse bugs. Signed-off-by: Joel Fernandes (Google) [ From an RCU perspective. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-2-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 835671f0f917..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@ */ #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efa686eeff26..713715dd00cf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu) #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. -- cgit v1.2.3 From 994aeb7a93e43d28f6074195ccb03a384342e1bf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:24 -0400 Subject: sched_domain: Annotate RCU pointers properly The scheduler uses RCU API in various places to access sched_domain pointers. These cause sparse errors as below. Many new errors show up because of an annotation check I added to rcu_assign_pointer(). Let us annotate the pointers correctly which also will help sparse catch any potential future bugs. This fixes the following sparse errors: rt.c:1681:9: error: incompatible types in comparison expression deadline.c:1904:9: error: incompatible types in comparison expression core.c:519:9: error: incompatible types in comparison expression core.c:1634:17: error: incompatible types in comparison expression fair.c:6193:14: error: incompatible types in comparison expression fair.c:9883:22: error: incompatible types in comparison expression fair.c:9897:9: error: incompatible types in comparison expression sched.h:1287:9: error: incompatible types in comparison expression topology.c:612:9: error: incompatible types in comparison expression topology.c:615:9: error: incompatible types in comparison expression sched.h:1300:9: error: incompatible types in comparison expression topology.c:618:9: error: incompatible types in comparison expression sched.h:1287:9: error: incompatible types in comparison expression topology.c:621:9: error: incompatible types in comparison expression sched.h:1300:9: error: incompatible types in comparison expression topology.c:624:9: error: incompatible types in comparison expression topology.c:671:9: error: incompatible types in comparison expression stats.c:45:17: error: incompatible types in comparison expression fair.c:5998:15: error: incompatible types in comparison expression fair.c:5989:15: error: incompatible types in comparison expression fair.c:5998:15: error: incompatible types in comparison expression fair.c:5989:15: error: incompatible types in comparison expression fair.c:6120:19: error: incompatible types in comparison expression fair.c:6506:14: error: incompatible types in comparison expression fair.c:6515:14: error: incompatible types in comparison expression fair.c:6623:9: error: incompatible types in comparison expression fair.c:5970:17: error: incompatible types in comparison expression fair.c:8642:21: error: incompatible types in comparison expression fair.c:9253:9: error: incompatible types in comparison expression fair.c:9331:9: error: incompatible types in comparison expression fair.c:9519:15: error: incompatible types in comparison expression fair.c:9533:14: error: incompatible types in comparison expression fair.c:9542:14: error: incompatible types in comparison expression fair.c:9567:14: error: incompatible types in comparison expression fair.c:9597:14: error: incompatible types in comparison expression fair.c:9421:16: error: incompatible types in comparison expression fair.c:9421:16: error: incompatible types in comparison expression Signed-off-by: Joel 
Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) [ From an RCU perspective. ] Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-3-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 14 +++++++------- kernel/sched/topology.c | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 713715dd00cf..2b452d68ab2e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -869,8 +869,8 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain __rcu *sd; unsigned long cpu_capacity; unsigned long cpu_capacity_orig; @@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) return sd; } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ab7f371a3a17..64bec54ded3e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd) * the cpumask of the domain), this allows us to quickly tell if * two CPUs are in the same cache domain, see cpus_share_cache(). */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) -- cgit v1.2.3 From 7ba7319f9e3898101bff5d63cbae5a6cc174c8c9 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:26 -0400 Subject: sched/core: Annotate perf_domain pointer with __rcu This fixes the following sparse errors in sched/fair.c: fair.c:6506:14: error: incompatible types in comparison expression fair.c:8642:21: error: incompatible types in comparison expression Using __rcu will also help sparse catch any future bugs. 
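For reference, a minimal sketch of how an __rcu annotated pointer like this is meant to be accessed (generic RCU API usage, not code taken from this patch): readers dereference it inside an RCU read-side critical section, and updaters publish a new value with rcu_assign_pointer():

	struct perf_domain *pd;

	/* reader */
	rcu_read_lock();
	pd = rcu_dereference(rd->pd);
	if (pd) {
		/* ... use *pd ... */
	}
	rcu_read_unlock();

	/* updater, serialized by the appropriate lock */
	rcu_assign_pointer(rd->pd, new_pd);

With the field annotated __rcu, sparse can flag any access that bypasses these helpers.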
Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) [ From an RCU perspective. ] Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-5-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b452d68ab2e..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -780,7 +780,7 @@ struct root_domain { * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. */ - struct perf_domain *pd; + struct perf_domain __rcu *pd; }; extern struct root_domain def_root_domain; -- cgit v1.2.3 From 71b47eaf6fb29b7f9722dc1646c26eb8a96e0a6d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:38:39 +0800 Subject: sched/fair: Make sync_entity_load_avg() and remove_entity_load_avg() static Fix these sparse warnings: kernel/sched/fair.c:3570:6: warning: symbol 'sync_entity_load_avg' was not declared. Should it be static? kernel/sched/fair.c:3583:6: warning: symbol 'remove_entity_load_avg' was not declared. Should it be static? Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190320133839.21392-1-yuehaibing@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40bd1e27b1b7..ed7f5f8107b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3567,7 +3567,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) * Synchronize entity load avg of dequeued entity without locking * the previous rq. */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; @@ -3580,7 +3580,7 @@ static void sync_entity_load_avg(struct sched_entity *se) * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; -- cgit v1.2.3 From 46ad0840b1584b92b5ff2cc3ed0b011dd6b8e0f1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:06 -0400 Subject: locking/rwsem: Remove arch specific rwsem files As the generic rwsem-xadd code is using the appropriate acquire and release versions of the atomic operations, the arch specific rwsem.h files will not be that much faster than the generic code as long as the atomic functions are properly implemented. So we can remove those arch specific rwsem.h and stop building asm/rwsem.h to reduce maintenance effort. Currently, only x86, alpha and ia64 have implemented architecture specific fast paths. I don't have access to alpha and ia64 systems for testing, but they are legacy systems that are not likely to be updated to the latest kernel anyway.
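As a reminder of how the generic fast path works (a rough summary of the xadd scheme, not code from this patch): sem->count packs the active reader/writer count into the low bits and a waiter bias into the upper bits, using the RWSEM_*_BIAS constants that the patch below moves into kernel/locking/rwsem.h:

	down_read:  count += RWSEM_ACTIVE_READ_BIAS;   /* result > 0: granted; result <= 0: writer active or waiting, slow path */
	down_write: count += RWSEM_ACTIVE_WRITE_BIAS;  /* result == RWSEM_ACTIVE_WRITE_BIAS: granted; anything else: slow path */
	up_read:    count -= RWSEM_ACTIVE_BIAS;        /* last active count gone with waiters queued: wake them */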
By using a rwsem microbenchmark, the total locking rates on a 4-socket 56-core 112-thread x86-64 system before and after the patch were as follows (mixed means equal # of read and write locks): Before Patch After Patch # of Threads wlock rlock mixed wlock rlock mixed ------------ ----- ----- ----- ----- ----- ----- 1 29,201 30,143 29,458 28,615 30,172 29,201 2 6,807 13,299 1,171 7,725 15,025 1,804 4 6,504 12,755 1,520 7,127 14,286 1,345 8 6,762 13,412 764 6,826 13,652 726 16 6,693 15,408 662 6,599 15,938 626 32 6,145 15,286 496 5,549 15,487 511 64 5,812 15,495 60 5,858 15,572 60 There were some run-to-run variations for the multi-thread tests. For x86-64, using the generic C code fast path seems to be a little bit faster than the assembly version with low lock contention. Looking at the assembly version of the fast paths, there are assembly to/from C code wrappers that save and restore all the callee-clobbered registers (7 registers on x86-64). The assembly generated from the generic C code doesn't need to do that. That may explain the slight performance gain here. The generic asm rwsem.h can also be merged into kernel/locking/rwsem.h with no code change as no other code other than those under kernel/locking needs to access the internal rwsem macros and functions. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/percpu-rwsem.c | 2 + kernel/locking/rwsem.h | 130 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@ #include #include +#include "rwsem.h" + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..067e265fa5c1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -32,6 +32,26 @@ # define DEBUG_RWSEMS_WARN_ON(c) #endif +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras . 
+ */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +152,113 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } #endif + +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) + rwsem_down_read_failed(sem); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + } + + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + long tmp; + + while ((tmp = atomic_long_read(&sem->count)) >= 0) { + if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + return 1; + } + } + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + return tmp == RWSEM_UNLOCKED_VALUE; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_dec_return_release(&sem->count); + if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count) < 0)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. 
+ */ + tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} + +#endif /* CONFIG_RWSEM_XCHGADD_ALGORITHM */ -- cgit v1.2.3 From 390a0c62c23cb026cd4664a66f6f45fed3a215f6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:07 -0400 Subject: locking/rwsem: Remove rwsem-spinlock.c & use rwsem-xadd.c for all archs Currently, we have two different implementations of rwsem: 1) CONFIG_RWSEM_GENERIC_SPINLOCK (rwsem-spinlock.c) 2) CONFIG_RWSEM_XCHGADD_ALGORITHM (rwsem-xadd.c) As we are going to use a single generic implementation for rwsem-xadd.c and no architecture-specific code will be needed, there is no point in keeping two different implementations of rwsem. In most cases, the performance of rwsem-spinlock.c will be worse. It also doesn't get all the performance tuning and optimizations that had been implemented in rwsem-xadd.c over the years. For simplification, we are going to remove rwsem-spinlock.c and make all architectures use a single implementation of rwsem - rwsem-xadd.c. All references to RWSEM_GENERIC_SPINLOCK and RWSEM_XCHGADD_ALGORITHM in the code are removed. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-3-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 2 +- kernel/locking/Makefile | 4 +- kernel/locking/rwsem-spinlock.c | 339 ---------------------------------------- kernel/locking/rwsem.h | 3 - 4 files changed, 2 insertions(+), 346 deletions(-) delete mode 100644 kernel/locking/rwsem-spinlock.c (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..e335953fa704 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER config RWSEM_SPIN_ON_OWNER def_bool y - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config LOCK_SPIN_ON_OWNER def_bool y diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..1af83e9ce57d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,6 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli - * - Derived also from comments by Linus - */ -#include -#include -#include -#include - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). 
- */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: - return 0; - -out_nolock: - /* - * We didn't take the lock, so that there is a writer, which - * is owner or the first waiter of the sem. If it's a waiter, - * it will be woken by current owner. Not need to wake anybody. - */ - list_del(&waiter.list); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ - __down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ - return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
- */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 067e265fa5c1..45ee00236e03 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -153,7 +153,6 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM /* * lock for reading */ @@ -260,5 +259,3 @@ static inline void __downgrade_write(struct rw_semaphore *sem) if (tmp < 0) rwsem_downgrade_wake(sem); } - -#endif /* CONFIG_RWSEM_XCHGADD_ALGORITHM */ -- cgit v1.2.3 From ddb20d1d3aed8f130519c0a29cd5392efcc067b8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:08 -0400 Subject: locking/rwsem: Optimize down_read_trylock() Modify __down_read_trylock() to optimize for an unlocked rwsem and make it generate slightly better code. 
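The key primitive here is atomic_long_try_cmpxchg_acquire(): unlike the cmpxchg() loop it replaces, try_cmpxchg() returns a bool and, on failure, updates the expected-value argument with the value it actually found, so the loop can start from an optimistic RWSEM_UNLOCKED_VALUE and never needs a separate reload of sem->count. A generic sketch of the idiom (illustration only, not the patched function, which is shown in the diff below):

	static inline int read_trylock_sketch(atomic_long_t *count)
	{
		long old = 0;	/* optimistic guess: lock is free */

		do {
			if (atomic_long_try_cmpxchg_acquire(count, &old, old + 1))
				return 1;	/* 'old' matched; lock acquired */
			/* on failure, 'old' now holds the current value of *count */
		} while (old >= 0);	/* keep trying while no writer is present */

		return 0;
	}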
Before this patch, down_read_trylock: 0x0000000000000000 <+0>: callq 0x5 0x0000000000000005 <+5>: jmp 0x18 0x0000000000000007 <+7>: lea 0x1(%rdx),%rcx 0x000000000000000b <+11>: mov %rdx,%rax 0x000000000000000e <+14>: lock cmpxchg %rcx,(%rdi) 0x0000000000000013 <+19>: cmp %rax,%rdx 0x0000000000000016 <+22>: je 0x23 0x0000000000000018 <+24>: mov (%rdi),%rdx 0x000000000000001b <+27>: test %rdx,%rdx 0x000000000000001e <+30>: jns 0x7 0x0000000000000020 <+32>: xor %eax,%eax 0x0000000000000022 <+34>: retq 0x0000000000000023 <+35>: mov %gs:0x0,%rax 0x000000000000002c <+44>: or $0x3,%rax 0x0000000000000030 <+48>: mov %rax,0x20(%rdi) 0x0000000000000034 <+52>: mov $0x1,%eax 0x0000000000000039 <+57>: retq After patch, down_read_trylock: 0x0000000000000000 <+0>: callq 0x5 0x0000000000000005 <+5>: xor %eax,%eax 0x0000000000000007 <+7>: lea 0x1(%rax),%rdx 0x000000000000000b <+11>: lock cmpxchg %rdx,(%rdi) 0x0000000000000010 <+16>: jne 0x29 0x0000000000000012 <+18>: mov %gs:0x0,%rax 0x000000000000001b <+27>: or $0x3,%rax 0x000000000000001f <+31>: mov %rax,0x20(%rdi) 0x0000000000000023 <+35>: mov $0x1,%eax 0x0000000000000028 <+40>: retq 0x0000000000000029 <+41>: test %rax,%rax 0x000000000000002c <+44>: jns 0x7 0x000000000000002e <+46>: xor %eax,%eax 0x0000000000000030 <+48>: retq By using a rwsem microbenchmark, the down_read_trylock() rate (with a load of 10 to lengthen the lock critical section) on an x86-64 system before and after the patch were: Before Patch After Patch # of Threads rlock rlock ------------ ----- ----- 1 14,496 14,716 2 8,644 8,453 4 6,799 6,983 8 5,664 7,190 On an ARM64 system, the performance results were: Before Patch After Patch # of Threads rlock rlock ------------ ----- ----- 1 23,676 24,488 2 7,697 9,502 4 4,945 3,440 8 2,641 1,603 For the uncontended case (1 thread), the new down_read_trylock() is a little bit faster. For the contended cases, the new down_read_trylock() performs pretty well on x86-64, but performance degrades at high contention levels on ARM64. Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-4-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 45ee00236e03..1f5775aa6a1d 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -174,14 +174,17 @@ static inline int __down_read_killable(struct rw_semaphore *sem) static inline int __down_read_trylock(struct rw_semaphore *sem) { - long tmp; + /* + * Optimize for the case when the rwsem is not locked at all.
+ */ + long tmp = RWSEM_UNLOCKED_VALUE; - while ((tmp = atomic_long_read(&sem->count)) >= 0) { - if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { return 1; } - } + } while (tmp >= 0); return 0; } -- cgit v1.2.3 From 06ee7115b0d1742de745ad143fb5e06d77d27fba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: bpf: add verifier stats and log_level bit 2 In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87221fda1321..e2001c1e40b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1092,7 +1092,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1139,6 +1139,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1155,7 +1156,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } @@ -1455,7 +1460,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. 
*/ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -2938,7 +2943,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -2978,7 +2983,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -5001,7 +5006,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -6181,6 +6186,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; @@ -6194,6 +6202,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6278,8 +6288,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; env->prev_linfo = NULL; @@ -6314,10 +6323,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. 
Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -6326,7 +6335,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -6344,8 +6353,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -6356,7 +6366,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -6621,16 +6631,6 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -7854,9 +7854,34 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; int i, len, ret = -EINVAL; @@ -7899,7 +7924,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -7980,6 +8005,9 @@ skip_full_check: if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { -- cgit v1.2.3 From 9f4686c41bdff051f557accb531af79dd1773687 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: bpf: improve verification speed by droping states Branch instructions, branch targets and calls in a bpf program are the places where the verifier remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. 
For unprivileged programs there is a limit of 64 states per such 'branching' instruction (the maximum list length is tracked by the max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases the insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit, and cilium programs can have a max_states_per_insn of 100 or higher. Walking 100+ states multiplied by the number of 'branching' insns during verification consumes a significant amount of CPU time. It turned out that a simple LRU-like mechanism can be used to remove states that are unlikely to be helpful in future search pruning.

This patch introduces hit_cnt and miss_cnt counters:
  hit_cnt - this many times this state successfully pruned the search
  miss_cnt - this many times this state was not equivalent to other states (and those other states were added to the state list)

The heuristic introduced in this patch is:
  if (sl->miss_cnt > sl->hit_cnt * 3 + 3)
	/* drop this state from future considerations */

Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce the insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (The analysis was done on selftests/progs/* and on cilium programs.)

The end result is that this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second or more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, it's now ~10.

There is a slight increase in insn_processed for cilium progs:

                      before  after
 bpf_lb-DLB_L3.o        1831   1838
 bpf_lb-DLB_L4.o        3029   3218
 bpf_lb-DUNKNOWN.o      1064   1064
 bpf_lxc-DDROP_ALL.o   26309  26935
 bpf_lxc-DUNKNOWN.o    33517  34439
 bpf_netdev.o           9713   9721
 bpf_overlay.o          6184   6184
 bpf_lcx_jit.o         37335  39389

And a 2-3 times improvement in verification speed.

Signed-off-by: Alexei Starovoitov
Reviewed-by: Jakub Kicinski
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e2001c1e40b3..a636db4a7a4e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6152,11 +6152,13 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
-	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
 
-	sl = env->explored_states[insn_idx];
+	pprev = &env->explored_states[insn_idx];
+	sl = *pprev;
+
 	if (!sl)
 		/* this 'insn_idx' instruction wasn't marked, so we will not
 		 * be doing state search here
 		 */
@@ -6167,6 +6169,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 
 	while (sl != STATE_LIST_MARK) {
 		if (states_equal(env, &sl->state, cur)) {
+			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
 			 * prune the search.
 			 * Registers read by the continuation are read by us.
@@ -6182,8 +6185,35 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl = sl->next; states_cnt++; + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful. Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } + pprev = &sl->next; + sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -7836,6 +7866,14 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; -- cgit v1.2.3 From 25af32dad8047d180e70e233c85b909dd6587cc5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:42 -0700 Subject: bpf: improve verification speed by not remarking live_read With large verifier speed improvement brought by the previous patch mark_reg_read() becomes the hottest function during verification. On a typical program it consumes 40% of cpu. mark_reg_read() walks parentage chain of registers to mark parents as LIVE_READ. Once the register is marked there is no need to remark it again in the future. Hence stop walking the chain once first LIVE_READ is seen. This optimization drops mark_reg_read() time from 40% of cpu to <1% and overall 2x improvement of verification speed. For some programs the longest_mark_read_walk counter improves from ~500 to ~5 Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a636db4a7a4e..94cf6efc5df6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1151,6 +1151,15 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + if (parent->live & REG_LIVE_READ) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + */ + break; /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; -- cgit v1.2.3 From 71dde681a8cea1ccff2c7b3be83c043ab6b2a977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:43 -0700 Subject: bpf: convert temp arrays to kvcalloc Temporary arrays used during program verification need to be vmalloc-ed to support large bpf programs. 
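[Editor's note: a usage sketch, assuming kernel context; the helper names are illustrative. kvcalloc() uses kmalloc for small requests and transparently falls back to vmalloc for large ones, and kvfree() releases memory obtained from either allocator, which is why the call sites in the diff below only need the s/kcalloc/kvcalloc/ substitution.]

#include <linux/mm.h>	/* kvcalloc(), kvfree() */

static int *alloc_insn_state(u32 insn_cnt)
{
	/* may be kmalloc- or vmalloc-backed depending on insn_cnt */
	return kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
}

static void free_insn_state(int *insn_state)
{
	kvfree(insn_state);	/* correct for either backing allocator */
}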
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94cf6efc5df6..ad3494a881da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5313,13 +5313,13 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } @@ -5417,8 +5417,8 @@ check_state: ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); return ret; } @@ -7898,7 +7898,7 @@ static void free_states(struct bpf_verifier_env *env) } } - kfree(env->explored_states); + kvfree(env->explored_states); } static void print_verification_stats(struct bpf_verifier_env *env) @@ -7994,7 +7994,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- cgit v1.2.3 From 4f73379ec5c2891598aa715c6df7ac9afdc86fbf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:44 -0700 Subject: bpf: verbose jump offset overflow check Larger programs may trigger 16-bit jump offset overflow check during instruction patching. Make this error verbose otherwise users cannot decipher error code without printks in the verifier. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 11 ++++++----- kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..2966cb368bf4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -438,6 +438,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -453,8 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, off + 1, off + len, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -463,7 +464,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -1096,13 +1097,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. 
*/ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ad3494a881da..6dcfeb44bb8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6932,8 +6932,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; + } if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; adjust_subprog_starts(env, off, len); -- cgit v1.2.3 From c04c0d2b968ac45d6ef020316808ef6c82325a82 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:45 -0700 Subject: bpf: increase complexity limit and maximum program size Large verifier speed improvements allow to increase verifier complexity limit. Now regardless of the program composition and its size it takes little time for the verifier to hit insn_processed limit. On typical x86 machine non-debug kernel processes 1M instructions in 1/10 of a second. (before these speed improvements specially crafted programs could be hitting multi-second verification times) Full kasan kernel with debug takes ~1 second for the same 1M insns. Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M. Also increase the number of instructions per program from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit. 4k limit was confusing to users, since small programs with hundreds of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit. Sometimes adding more insns and bpf_trace_printk debug statements would make the verifier accept the program while removing code would make the verifier reject it. Some user space application started to add #define MAX_FOO to their programs and do: MAX_FOO=100; again: compile with MAX_FOO; try to load; if (fails_to_load) { reduce MAX_FOO; goto again; } to be able to fit maximum amount of processing into single program. Other users artificially split their single program into a set of programs and use all 32 iterations of tail_calls to increase compute limits. And the most advanced folks used unlimited tc-bpf filter list to execute many bpf programs. Essentially the users managed to workaround 4k insn limit. This patch removes the limit for root programs from uapi. BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit and success to load the program no longer depends on program size, but on 'smartness' of the verifier only. The verifier will continue to get smarter with every kernel release. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index afca36f53c49..1d65e56594db 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1557,7 +1557,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + if (attr->insn_cnt == 0 || + attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? 
			     BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
 		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6dcfeb44bb8e..b631e89e7a51 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -176,7 +176,6 @@ struct bpf_verifier_stack_elem {
 	struct bpf_verifier_stack_elem *next;
 };
 
-#define BPF_COMPLEXITY_LIMIT_INSNS	131072
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 #define BPF_COMPLEXITY_LIMIT_STATES	64
-- cgit v1.2.3

From 7a9f5c65abcc9644b11738ca0815510cb5510eaf Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 1 Apr 2019 21:27:46 -0700
Subject: bpf: increase verifier log limit

The existing 16 Mbyte verifier log limit is not enough for log_level=2 even for small programs. Increase it to 1 Gbyte. Note that this is not a kernel memory limit; it is the amount of memory user space provides to store the verifier log. The kernel populates it 1k at a time.

Signed-off-by: Alexei Starovoitov
Reviewed-by: Jakub Kicinski
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b631e89e7a51..bb27b675923c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7974,7 +7974,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 
 	ret = -EINVAL;
 	/* log attributes have to be sane */
-	if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+	if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
 	    !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
 		goto err_unlock;
 	}
-- cgit v1.2.3

From d6e486ee0ef2f99a4069d9186e53dac61b28cb3c Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Wed, 3 Apr 2019 16:03:54 -0700
Subject: cgroup: remove extra cgroup_migrate_finish() call

The callers of cgroup_migrate_prepare_dst() correctly call cgroup_migrate_finish() in both the success and failure cases. There is no need to call it in cgroup_migrate_prepare_dst() itself in the failure case.

Signed-off-by: Shakeel Butt
Reviewed-by: Daniel Jordan
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3f2b4bde0f9c..f219c195a9a5 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2602,7 +2602,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 
 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
 		if (!dst_cset)
-			goto err;
+			return -ENOMEM;
 
 		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
 
@@ -2634,9 +2634,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 	}
 
 	return 0;
-err:
-	cgroup_migrate_finish(mgctx);
-	return -ENOMEM;
 }
 
 /**
-- cgit v1.2.3

From 9419a3191dcb27f24478d288abaab697228d28e6 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 4 Apr 2019 21:04:13 -0400
Subject: acct_on(): don't mess with freeze protection

What happens here is that we are replacing file->path.mnt of a file we'd just opened with a clone, and we need the write count contribution to be transferred from the original mount to the new one. That's it. We do *NOT* want any kind of freeze protection for the duration of the switchover. IOW, we should just use __mnt_{want,drop}_write() for that switchover; no need to bother with mnt_{want,drop}_write() there.
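[Editor's note: for context, mnt_want_write() is essentially the freeze-protected wrapper around __mnt_want_write(); the sketch below is simplified from fs/namespace.c and is illustrative rather than verbatim. The patch drops the sb_start_write()/sb_end_write() pair and keeps only the per-mount write-count accounting.]

int mnt_want_write(struct vfsmount *mnt)
{
	int ret;

	sb_start_write(mnt->mnt_sb);	/* freeze protection */
	ret = __mnt_want_write(mnt);	/* per-mount write count */
	if (ret)
		sb_end_write(mnt->mnt_sb);
	return ret;
}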
Tested-by: Amir Goldstein Reported-by: syzbot+2a73a6ea9507b7112141@syzkaller.appspotmail.com Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = mnt_want_write(internal); + err = __mnt_want_write(internal); if (err) { mntput(internal); kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); mntput(mnt); return 0; } -- cgit v1.2.3 From f2bcd05ec7b839ff826d2008506ad2d2dff46a59 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:37 -0700 Subject: bpf: Reject indirect var_off stack access in raw mode It's hard to guarantee that whole memory is marked as initialized on helper return if uninitialized stack is accessed with variable offset since specific bounds are unknown to verifier. This may cause uninitialized stack leaking. Reject such an access in check_stack_boundary to prevent possible leaking. There are no known use-cases for indirect uninitialized stack access with variable offset so it shouldn't break anything. Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb27b675923c..0f12fda35626 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2226,6 +2226,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (err) return err; } else { + /* Only initialized buffer on stack is allowed to be accessed + * with variable offset. With uninitialized buffer it's hard to + * guarantee that whole memory is marked as initialized on + * helper return since specific bounds are unknown what may + * cause uninitialized stack leaking. + */ + if (meta && meta->raw_mode) + meta = NULL; + min_off = reg->smin_value + reg->off; max_off = reg->umax_value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, -- cgit v1.2.3 From 088ec26d9c2da9d879ab73e3f4117f9df6c566ee Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:39 -0700 Subject: bpf: Reject indirect var_off stack access in unpriv mode Proper support of indirect stack access with variable offset in unprivileged mode (!root) requires corresponding support in Spectre masking for stack ALU in retrieve_ptr_limit(). There are no use-case for variable offset in unprivileged mode though so make verifier reject such accesses for simplicity. Pointer arithmetics is one (and only?) 
way to cause variable offset and it's already rejected in unpriv mode so that verifier won't even get to helper function whose argument contains variable offset, e.g.: 0: (7a) *(u64 *)(r10 -16) = 0 1: (7a) *(u64 *)(r10 -8) = 0 2: (61) r2 = *(u32 *)(r1 +0) 3: (57) r2 &= 4 4: (17) r2 -= 16 5: (0f) r2 += r10 variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Still it looks like a good idea to reject variable offset indirect stack access for unprivileged mode in check_stack_boundary() explicitly. Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0f12fda35626..8400c1f33cd4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2226,6 +2226,19 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (err) return err; } else { + /* Variable offset is prohibited for unprivileged mode for + * simplicity since it requires corresponding support in + * Spectre masking for stack ALU. + * See also retrieve_ptr_limit(). + */ + if (!env->allow_ptr_leaks) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", + regno, tn_buf); + return -EACCES; + } /* Only initialized buffer on stack is allowed to be accessed * with variable offset. With uninitialized buffer it's hard to * guarantee that whole memory is marked as initialized on @@ -3339,6 +3352,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, switch (ptr_reg->type) { case PTR_TO_STACK: + /* Indirect variable offset stack access is prohibited in + * unprivileged mode so it's not handled here. + */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) *ptr_limit = MAX_BPF_STACK + off; -- cgit v1.2.3 From 107c26a70ca81bfc33657366ad69d02fdc9efc9d Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:41 -0700 Subject: bpf: Sanity check max value for var_off stack access As discussed in [1] max value of variable offset has to be checked for overflow on stack access otherwise verifier would accept code like this: 0: (b7) r2 = 6 1: (b7) r3 = 28 2: (7a) *(u64 *)(r10 -16) = 0 3: (7a) *(u64 *)(r10 -8) = 0 4: (79) r4 = *(u64 *)(r1 +168) 5: (c5) if r4 s< 0x0 goto pc+4 R1=ctx(id=0,off=0,imm=0) R2=inv6 R3=inv28 R4=inv(id=0,umax_value=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0,call_-1 fp-8=mmmmmmmm fp-16=mmmmmmmm 6: (17) r4 -= 16 7: (0f) r4 += r10 8: (b7) r5 = 8 9: (85) call bpf_getsockopt#57 10: (b7) r0 = 0 11: (95) exit , where R4 obviosly has unbounded max value. Fix it by checking that reg->smax_value is inside (-BPF_MAX_VAR_OFF; BPF_MAX_VAR_OFF) range. reg->smax_value is used instead of reg->umax_value because stack pointers are calculated using negative offset from fp. This is opposite to e.g. map access where offset must be non-negative and where umax_value is used. Also dedicated verbose logs are added for both min and max bound check failures to have diagnostics consistent with variable offset handling in check_map_access(). 
[1] https://marc.info/?l=linux-netdev&m=155433357510597&w=2 Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8400c1f33cd4..f2d600199e66 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2248,16 +2248,28 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smax_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "R%d unbounded indirect variable offset stack access\n", + regno); + return -EACCES; + } min_off = reg->smin_value + reg->off; - max_off = reg->umax_value + reg->off; + max_off = reg->smax_value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, zero_size_allowed); - if (err) + if (err) { + verbose(env, "R%d min value is outside of stack bound\n", + regno); return err; + } err = __check_stack_boundary(env, regno, max_off, access_size, zero_size_allowed); - if (err) + if (err) { + verbose(env, "R%d max value is outside of stack bound\n", + regno); return err; + } } if (meta && meta->raw_mode) { -- cgit v1.2.3 From 1fbd20f8b77b366ea4aeb92ade72daa7f36a7e3b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:43 -0700 Subject: bpf: Add missed newline in verifier verbose log check_stack_access() that prints verbose log is used in adjust_ptr_min_max_vals() that prints its own verbose log and now they stick together, e.g.: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Add missing newline so that log is more readable: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1 R2 stack pointer arithmetic goes out of range, prohibited for !root Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f2d600199e66..48718e1da16d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1426,7 +1426,7 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } -- cgit v1.2.3 From bfe83844987a52dc1f71f757b60523811502dc93 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 28 Mar 2019 16:13:35 +0100 Subject: genirq/timings: Remove variance computation code The variance computation did not provide the expected results and will be replaced with a different approach to compute the next interrupt based on the array suffixes derived algorithm. There is no good way to transform the variance code to the new algorithm, so for ease of review remove the existing code first. 
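[Editor's note: for reference, the statistical filter being removed reduces to the check below — a simplified sketch with illustrative names; the full derivation lives in the deleted comments that follow. A sample is accepted as "normal" when it lies within avg +/- 3*stddev; squaring both sides avoids the square root, giving (value - avg)^2 < 9 * variance.]

static bool interval_is_anomaly(u64 interval, u32 avg, u64 variance)
{
	s64 diff = (s64)interval - avg;

	/* outside avg +/- 3*stddev, tested without computing a sqrt */
	return (u64)(diff * diff) > 9 * variance;
}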
Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: rjw@rjwysocki.net Cc: ulf.hansson@linaro.org Cc: linux-pm@vger.kernel.org Link: https://lkml.kernel.org/r/20190328151336.5316-1-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 252 +-------------------------------------------------- 1 file changed, 2 insertions(+), 250 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..3cde046a2bc8 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -8,7 +8,6 @@ #include #include #include -#include #include @@ -19,13 +18,7 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); struct irqt_stat { - u64 next_evt; - u64 last_ts; - u64 variance; - u32 avg; - u32 nr_samples; - int anomalies; - int valid; + u64 next_evt; }; static DEFINE_IDR(irqt_stats); @@ -40,184 +33,6 @@ void irq_timings_disable(void) static_branch_disable(&irq_timing_enabled); } -/** - * irqs_update - update the irq timing statistics with a new timestamp - * - * @irqs: an irqt_stat struct pointer - * @ts: the new timestamp - * - * The statistics are computed online, in other words, the code is - * designed to compute the statistics on a stream of values rather - * than doing multiple passes on the values to compute the average, - * then the variance. The integer division introduces a loss of - * precision but with an acceptable error margin regarding the results - * we would have with the double floating precision: we are dealing - * with nanosec, so big numbers, consequently the mantisse is - * negligeable, especially when converting the time in usec - * afterwards. - * - * The computation happens at idle time. When the CPU is not idle, the - * interrupts' timestamps are stored in the circular buffer, when the - * CPU goes idle and this routine is called, all the buffer's values - * are injected in the statistical model continuying to extend the - * statistics from the previous busy-idle cycle. - * - * The observations showed a device will trigger a burst of periodic - * interrupts followed by one or two peaks of longer time, for - * instance when a SD card device flushes its cache, then the periodic - * intervals occur again. A one second inactivity period resets the - * stats, that gives us the certitude the statistical values won't - * exceed 1x10^9, thus the computation won't overflow. - * - * Basically, the purpose of the algorithm is to watch the periodic - * interrupts and eliminate the peaks. - * - * An interrupt is considered periodically stable if the interval of - * its occurences follow the normal distribution, thus the values - * comply with: - * - * avg - 3 x stddev < value < avg + 3 x stddev - * - * Which can be simplified to: - * - * -3 x stddev < value - avg < 3 x stddev - * - * abs(value - avg) < 3 x stddev - * - * In order to save a costly square root computation, we use the - * variance. For the record, stddev = sqrt(variance). The equation - * above becomes: - * - * abs(value - avg) < 3 x sqrt(variance) - * - * And finally we square it: - * - * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 - * - * (value - avg) x (value - avg) < 9 x variance - * - * Statistically speaking, any values out of this interval is - * considered as an anomaly and is discarded. However, a normal - * distribution appears when the number of samples is 30 (it is the - * rule of thumb in statistics, cf. "30 samples" on Internet). When - * there are three consecutive anomalies, the statistics are resetted. 
- * - */ -static void irqs_update(struct irqt_stat *irqs, u64 ts) -{ - u64 old_ts = irqs->last_ts; - u64 variance = 0; - u64 interval; - s64 diff; - - /* - * The timestamps are absolute time values, we need to compute - * the timing interval between two interrupts. - */ - irqs->last_ts = ts; - - /* - * The interval type is u64 in order to deal with the same - * type in our computation, that prevent mindfuck issues with - * overflow, sign and division. - */ - interval = ts - old_ts; - - /* - * The interrupt triggered more than one second apart, that - * ends the sequence as predictible for our purpose. In this - * case, assume we have the beginning of a sequence and the - * timestamp is the first value. As it is impossible to - * predict anything at this point, return. - * - * Note the first timestamp of the sequence will always fall - * in this test because the old_ts is zero. That is what we - * want as we need another timestamp to compute an interval. - */ - if (interval >= NSEC_PER_SEC) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; - } - - /* - * Pre-compute the delta with the average as the result is - * used several times in this function. - */ - diff = interval - irqs->avg; - - /* - * Increment the number of samples. - */ - irqs->nr_samples++; - - /* - * Online variance divided by the number of elements if there - * is more than one sample. Normally the formula is division - * by nr_samples - 1 but we assume the number of element will be - * more than 32 and dividing by 32 instead of 31 is enough - * precise. - */ - if (likely(irqs->nr_samples > 1)) - variance = irqs->variance >> IRQ_TIMINGS_SHIFT; - - /* - * The rule of thumb in statistics for the normal distribution - * is having at least 30 samples in order to have the model to - * apply. Values outside the interval are considered as an - * anomaly. - */ - if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { - /* - * After three consecutive anomalies, we reset the - * stats as it is no longer stable enough. - */ - if (irqs->anomalies++ >= 3) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; - } - } else { - /* - * The anomalies must be consecutives, so at this - * point, we reset the anomalies counter. - */ - irqs->anomalies = 0; - } - - /* - * The interrupt is considered stable enough to try to predict - * the next event on it. - */ - irqs->valid = 1; - - /* - * Online average algorithm: - * - * new_average = average + ((value - average) / count) - * - * The variance computation depends on the new average - * to be computed here first. 
- * - */ - irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); - - /* - * Online variance algorithm: - * - * new_variance = variance + (value - average) x (value - new_average) - * - * Warning: irqs->avg is updated with the line above, hence - * 'interval - irqs->avg' is no longer equal to 'diff' - */ - irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); - - /* - * Update the next event - */ - irqs->next_evt = ts + irqs->avg; -} - /** * irq_timings_next_event - Return when the next event is supposed to arrive * @@ -246,12 +61,6 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts) */ u64 irq_timings_next_event(u64 now) { - struct irq_timings *irqts = this_cpu_ptr(&irq_timings); - struct irqt_stat *irqs; - struct irqt_stat __percpu *s; - u64 ts, next_evt = U64_MAX; - int i, irq = 0; - /* * This function must be called with the local irq disabled in * order to prevent the timings circular buffer to be updated @@ -259,64 +68,7 @@ u64 irq_timings_next_event(u64 now) */ lockdep_assert_irqs_disabled(); - /* - * Number of elements in the circular buffer: If it happens it - * was flushed before, then the number of elements could be - * smaller than IRQ_TIMINGS_SIZE, so the count is used, - * otherwise the array size is used as we wrapped. The index - * begins from zero when we did not wrap. That could be done - * in a nicer way with the proper circular array structure - * type but with the cost of extra computation in the - * interrupt handler hot path. We choose efficiency. - * - * Inject measured irq/timestamp to the statistical model - * while decrementing the counter because we consume the data - * from our circular buffer. - */ - for (i = irqts->count & IRQ_TIMINGS_MASK, - irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); - irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { - - irq = irq_timing_decode(irqts->values[i], &ts); - - s = idr_find(&irqt_stats, irq); - if (s) { - irqs = this_cpu_ptr(s); - irqs_update(irqs, ts); - } - } - - /* - * Look in the list of interrupts' statistics, the earliest - * next event. - */ - idr_for_each_entry(&irqt_stats, s, i) { - - irqs = this_cpu_ptr(s); - - if (!irqs->valid) - continue; - - if (irqs->next_evt <= now) { - irq = i; - next_evt = now; - - /* - * This interrupt mustn't use in the future - * until new events occur and update the - * statistics. - */ - irqs->valid = 0; - break; - } - - if (irqs->next_evt < next_evt) { - irq = i; - next_evt = irqs->next_evt; - } - } - - return next_evt; + return 0; } void irq_timings_free(int irq) -- cgit v1.2.3 From bbba0e7c5cdadb47a91edea1d5cd0caadbbb016f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 28 Mar 2019 16:13:36 +0100 Subject: genirq/timings: Add array suffix computation code The previous approach based on the variance was discarding values from the timings when they were considered as anomalies as stated by the normal law statistical model. However in the interrupt life, there can be multiple anomalies due to the nature of the device generating the interrupts, and most of the time a repeating pattern can be observed, that is particulary true for network, console, MMC or SSD devices. The variance approach missed the patterns and it was only able to deal with the interrupt coming in regular intervals, thus reducing considerably the scope of what is predictable. In order to find out the repeating patterns, the interrupt intervals are grouped in a ilog2 basis to create a suite of numbers with small amplitude. 
Every group contains an exponential moving average of the values belonging to the group. The array suffix, a data structure used for string searching, data compression, etc ..., is built from the suite of numbers and the suffixes are then searched in this suite. The tests showed the algorithm is able to find all repeating patterns, as well as regular interval in less than 1us on x86-i7. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: rjw@rjwysocki.net Cc: ulf.hansson@linaro.org Cc: linux-pm@vger.kernel.org Link: https://lkml.kernel.org/r/20190328151336.5316-2-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 462 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 457 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 3cde046a2bc8..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include @@ -17,10 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); -struct irqt_stat { - u64 next_evt; -}; - static DEFINE_IDR(irqt_stats); void irq_timings_enable(void) @@ -33,6 +31,410 @@ void irq_timings_disable(void) static_branch_disable(&irq_timing_enabled); } +/* + * The main goal of this algorithm is to predict the next interrupt + * occurrence on the current CPU. + * + * Currently, the interrupt timings are stored in a circular array + * buffer every time there is an interrupt, as a tuple: the interrupt + * number and the associated timestamp when the event occurred . + * + * For every interrupt occurring in a short period of time, we can + * measure the elapsed time between the occurrences for the same + * interrupt and we end up with a suite of intervals. The experience + * showed the interrupts are often coming following a periodic + * pattern. + * + * The objective of the algorithm is to find out this periodic pattern + * in a fastest way and use its period to predict the next irq event. + * + * When the next interrupt event is requested, we are in the situation + * where the interrupts are disabled and the circular buffer + * containing the timings is filled with the events which happened + * after the previous next-interrupt-event request. + * + * At this point, we read the circular buffer and we fill the irq + * related statistics structure. After this step, the circular array + * containing the timings is empty because all the values are + * dispatched in their corresponding buffers. + * + * Now for each interrupt, we can predict the next event by using the + * suffix array, log interval and exponential moving average + * + * 1. Suffix array + * + * Suffix array is an array of all the suffixes of a string. It is + * widely used as a data structure for compression, text search, ... + * For instance for the word 'banana', the suffixes will be: 'banana' + * 'anana' 'nana' 'ana' 'na' 'a' + * + * Usually, the suffix array is sorted but for our purpose it is + * not necessary and won't provide any improvement in the context of + * the solved problem where we clearly define the boundaries of the + * search by a max period and min period. + * + * The suffix array will build a suite of intervals of different + * length and will look for the repetition of each suite. If the suite + * is repeating then we have the period because it is the length of + * the suite whatever its position in the buffer. + * + * 2. 
Log interval + * + * We saw the irq timings allow to compute the interval of the + * occurrences for a specific interrupt. We can reasonibly assume the + * longer is the interval, the higher is the error for the next event + * and we can consider storing those interval values into an array + * where each slot in the array correspond to an interval at the power + * of 2 of the index. For example, index 12 will contain values + * between 2^11 and 2^12. + * + * At the end we have an array of values where at each index defines a + * [2^index - 1, 2 ^ index] interval values allowing to store a large + * number of values inside a small array. + * + * For example, if we have the value 1123, then we store it at + * ilog2(1123) = 10 index value. + * + * Storing those value at the specific index is done by computing an + * exponential moving average for this specific slot. For instance, + * for values 1800, 1123, 1453, ... fall under the same slot (10) and + * the exponential moving average is computed every time a new value + * is stored at this slot. + * + * 3. Exponential Moving Average + * + * The EMA is largely used to track a signal for stocks or as a low + * pass filter. The magic of the formula, is it is very simple and the + * reactivity of the average can be tuned with the factors called + * alpha. + * + * The higher the alphas are, the faster the average respond to the + * signal change. In our case, if a slot in the array is a big + * interval, we can have numbers with a big difference between + * them. The impact of those differences in the average computation + * can be tuned by changing the alpha value. + * + * + * -- The algorithm -- + * + * We saw the different processing above, now let's see how they are + * used together. + * + * For each interrupt: + * For each interval: + * Compute the index = ilog2(interval) + * Compute a new_ema(buffer[index], interval) + * Store the index in a circular buffer + * + * Compute the suffix array of the indexes + * + * For each suffix: + * If the suffix is reverse-found 3 times + * Return suffix + * + * Return Not found + * + * However we can not have endless suffix array to be build, it won't + * make sense and it will add an extra overhead, so we can restrict + * this to a maximum suffix length of 5 and a minimum suffix length of + * 2. The experience showed 5 is the majority of the maximum pattern + * period found for different devices. + * + * The result is a pattern finding less than 1us for an interrupt. + * + * Example based on real values: + * + * Example 1 : MMC write/read interrupt interval: + * + * 223947, 1240, 1384, 1386, 1386, + * 217416, 1236, 1384, 1386, 1387, + * 214719, 1241, 1386, 1387, 1384, + * 213696, 1234, 1384, 1386, 1388, + * 219904, 1240, 1385, 1389, 1385, + * 212240, 1240, 1386, 1386, 1386, + * 214415, 1236, 1384, 1386, 1387, + * 214276, 1234, 1384, 1388, ? + * + * For each element, apply ilog2(value) + * + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? + * + * Max period of 5, we take the last (max_period * 3) 15 elements as + * we can be confident if the pattern repeats itself three times it is + * a repeating pattern. + * + * 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? + * + * Suffixes are: + * + * 1) 8, 15, 8, 8, 8 <- max period + * 2) 8, 15, 8, 8 + * 3) 8, 15, 8 + * 4) 8, 15 <- min period + * + * From there we search the repeating pattern for each suffix. 
+ * + * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 + * | | | | | | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | + * 8, 15, 8, 8, 8 + * + * When moving the suffix, we found exactly 3 matches. + * + * The first suffix with period 5 is repeating. + * + * The next event is (3 * max_period) % suffix_period + * + * In this example, the result 0, so the next event is suffix[0] => 8 + * + * However, 8 is the index in the array of exponential moving average + * which was calculated on the fly when storing the values, so the + * interval is ema[8] = 1366 + * + * + * Example 2: + * + * 4, 3, 5, 100, + * 3, 3, 5, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 5, 3, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 4, 5, 112, + * 4, 3, 4, 110 + * + * ilog2 + * + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Max period 5: + * 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Suffixes: + * + * 1) 0, 0, 4, 0, 0 + * 2) 0, 0, 4, 0 + * 3) 0, 0, 4 + * 4) 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | X + * 0, 0, 4, 0, 0, | X + * 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | + * 0, 0, 4, 0, | | | + * 0 0 4 + * + * Pattern is found 3 times, the remaining is 1 which results from + * (max_period * 3) % suffix_period. This value is the index in the + * suffix arrays. The suffix array for a period 4 has the value 4 + * at index 1. + */ +#define EMA_ALPHA_VAL 64 +#define EMA_ALPHA_SHIFT 7 + +#define PREDICTION_PERIOD_MIN 2 +#define PREDICTION_PERIOD_MAX 5 +#define PREDICTION_FACTOR 4 +#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ +#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ + +struct irqt_stat { + u64 last_ts; + u64 ema_time[PREDICTION_BUFFER_SIZE]; + int timings[IRQ_TIMINGS_SIZE]; + int circ_timings[IRQ_TIMINGS_SIZE]; + int count; +}; + +/* + * Exponential moving average computation + */ +static u64 irq_timings_ema_new(u64 value, u64 ema_old) +{ + s64 diff; + + if (unlikely(!ema_old)) + return value; + + diff = (value - ema_old) * EMA_ALPHA_VAL; + /* + * We can use a s64 type variable to be added with the u64 + * ema_old variable as this one will never have its topmost + * bit set, it will be always smaller than 2^63 nanosec + * interrupt interval (292 years). + */ + return ema_old + (diff >> EMA_ALPHA_SHIFT); +} + +static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) +{ + int i; + + /* + * The buffer contains the suite of intervals, in a ilog2 + * basis, we are looking for a repetition. We point the + * beginning of the search three times the length of the + * period beginning at the end of the buffer. We do that for + * each suffix. + */ + for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + + int *begin = &buffer[len - (i * 3)]; + int *ptr = begin; + + /* + * We look if the suite with period 'i' repeat + * itself. If it is truncated at the end, as it + * repeats we can use the period to find out the next + * element. 
+ */ + while (!memcmp(ptr, begin, i * sizeof(*ptr))) { + ptr += i; + if (ptr >= &buffer[len]) + return begin[((i * 3) % i)]; + } + } + + return -1; +} + +static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) +{ + int index, i, period_max, count, start, min = INT_MAX; + + if ((now - irqs->last_ts) >= NSEC_PER_SEC) { + irqs->count = irqs->last_ts = 0; + return U64_MAX; + } + + /* + * As we want to find three times the repetition, we need a + * number of intervals greater or equal to three times the + * maximum period, otherwise we truncate the max period. + */ + period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : irqs->count / 3; + + /* + * If we don't have enough irq timings for this prediction, + * just bail out. + */ + if (period_max <= PREDICTION_PERIOD_MIN) + return U64_MAX; + + /* + * 'count' will depends if the circular buffer wrapped or not + */ + count = irqs->count < IRQ_TIMINGS_SIZE ? + irqs->count : IRQ_TIMINGS_SIZE; + + start = irqs->count < IRQ_TIMINGS_SIZE ? + 0 : (irqs->count & IRQ_TIMINGS_MASK); + + /* + * Copy the content of the circular buffer into another buffer + * in order to linearize the buffer instead of dealing with + * wrapping indexes and shifted array which will be prone to + * error and extremelly difficult to debug. + */ + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + + irqs->timings[i] = irqs->circ_timings[index]; + min = min_t(int, irqs->timings[i], min); + } + + index = irq_timings_next_event_index(irqs->timings, count, period_max); + if (index < 0) + return irqs->last_ts + irqs->ema_time[min]; + + return irqs->last_ts + irqs->ema_time[index]; +} + +static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) +{ + u64 old_ts = irqs->last_ts; + u64 interval; + int index; + + /* + * The timestamps are absolute time values, we need to compute + * the timing interval between two interrupts. + */ + irqs->last_ts = ts; + + /* + * The interval type is u64 in order to deal with the same + * type in our computation, that prevent mindfuck issues with + * overflow, sign and division. + */ + interval = ts - old_ts; + + /* + * The interrupt triggered more than one second apart, that + * ends the sequence as predictible for our purpose. In this + * case, assume we have the beginning of a sequence and the + * timestamp is the first value. As it is impossible to + * predict anything at this point, return. + * + * Note the first timestamp of the sequence will always fall + * in this test because the old_ts is zero. That is what we + * want as we need another timestamp to compute an interval. + */ + if (interval >= NSEC_PER_SEC) { + irqs->count = 0; + return; + } + + /* + * Get the index in the ema table for this interrupt. The + * PREDICTION_FACTOR increase the interval size for the array + * of exponential average. + */ + index = likely(interval) ? + ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; + + /* + * Store the index as an element of the pattern in another + * circular array. 
+ */ + irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; + + irqs->ema_time[index] = irq_timings_ema_new(interval, + irqs->ema_time[index]); + + irqs->count++; +} + /** * irq_timings_next_event - Return when the next event is supposed to arrive * @@ -61,6 +463,12 @@ void irq_timings_disable(void) */ u64 irq_timings_next_event(u64 now) { + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + struct irqt_stat *irqs; + struct irqt_stat __percpu *s; + u64 ts, next_evt = U64_MAX; + int i, irq = 0; + /* * This function must be called with the local irq disabled in * order to prevent the timings circular buffer from being updated @@ -68,7 +476,51 @@ u64 irq_timings_next_event(u64 now) */ lockdep_assert_irqs_disabled(); - return 0; + if (!irqts->count) + return next_evt; + + /* + * Number of elements in the circular buffer: if it was flushed + * before, then the number of elements could be smaller than + * IRQ_TIMINGS_SIZE, so the count is used; otherwise the array + * size is used, as we wrapped. The index begins from zero when + * we did not wrap. That could be done in a nicer way with a + * proper circular array structure type, but at the cost of + * extra computation in the interrupt handler hot path. We + * choose efficiency. + * + * Inject the measured irq/timestamp pairs into the pattern + * prediction model while decrementing the counter, because we + * consume the data from our circular buffer. + */ + + i = (irqts->count & IRQ_TIMINGS_MASK) - 1; + irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); + + for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + irq = irq_timing_decode(irqts->values[i], &ts); + s = idr_find(&irqt_stats, irq); + if (s) + irq_timings_store(irq, this_cpu_ptr(s), ts); + } + + /* + * Look through each interrupt's statistics for the earliest + * next event. + */ + idr_for_each_entry(&irqt_stats, s, i) { + + irqs = this_cpu_ptr(s); + + ts = __irq_timings_next_event(irqs, i, now); + if (ts <= now) + return now; + + if (ts < next_evt) + next_evt = ts; + } + + return next_evt; } void irq_timings_free(int irq) -- cgit v1.2.3
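As a standalone sketch of the suffix detection above (illustrative userspace C, not part of the patch; only the comparison loop mirrors the kernel logic):

#include <stdio.h>
#include <string.h>

/* Check whether the last (3 * period) slots repeat with that period and,
 * if so, return the value predicted for the next slot. */
static int next_event_index(const int *buffer, size_t len, int period_max,
			    int period_min)
{
	for (int i = period_max; i >= period_min; i--) {
		const int *begin = &buffer[len - (i * 3)];
		const int *ptr = begin;

		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buffer[len])
				return begin[(i * 3) % i]; /* (i * 3) % i == 0 */
		}
	}
	return -1;
}

int main(void)
{
	/* Example 1 above: ilog2 values with a repeating period of 5 */
	int buf[] = { 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 };

	printf("next slot: %d\n",
	       next_event_index(buf, sizeof(buf) / sizeof(buf[0]), 5, 2));
	return 0;
}

This prints "next slot: 8": the period-5 suffix matches three times, so the prediction is begin[0], which is then mapped back through the exponential moving average table to get the actual interval.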
From 9eca544b1491df90ea7102a7ed14acc3c562d97b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Mar 2019 11:33:21 +0100 Subject: cpufreq: schedutil: Simplify iowait boosting There is no reason for the minimum iowait boost value in the schedutil cpufreq governor to depend on the available range of CPU frequencies. In fact, that dependency is generally confusing, because it causes the iowait boost to behave somewhat differently on CPUs with the same maximum frequency and different minimum frequencies, for example. For this reason, replace the min field in struct sugov_cpu with a constant and choose its value to be 1/8 of SCHED_CAPACITY_SCALE (for consistency with the intel_pstate driver's internal governor). [Note that policy->cpuinfo.max_freq will not be constant any more after a subsequent change, which depends on this one.] Link: https://lore.kernel.org/lkml/20190305083202.GU32494@hirez.programming.kicks-ass.net/T/#ee20bdc98b7d89f6110c0d00e5c3ee8c2ced93c3d Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c41ea367422..b3a878aa593d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@ #include #include +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -51,7 +53,6 @@ struct sugov_cpu { u64 last_update; unsigned long bw_dl; - unsigned long min; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more than a tick, then - we enable the boost starting from the minimum frequency, which improves - energy efficiency by ignoring sporadic wakeups from IO. + we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + efficiency by ignoring sporadic wakeups from IO. */ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,7 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -317,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - successive" wakeup from IO, ranging from the utilization of the minimum - OPP to the utilization of the maximum OPP. + successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + of the maximum OPP. + * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. */ @@ -349,7 +351,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** @@ -389,7 +391,7 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; return util; } @@ -826,9 +828,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->min = - (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) / - policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { -- cgit v1.2.3
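A minimal sketch of the resulting boost behaviour (illustrative flow only, not the kernel implementation):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

/* Double the boost on each frequent IO wakeup, halve it otherwise. */
static unsigned long iowait_boost(unsigned long boost, int io_wakeup)
{
	if (io_wakeup) {
		if (!boost)
			return IOWAIT_BOOST_MIN;	/* first wakeup after IO */
		boost *= 2;
		return boost > SCHED_CAPACITY_SCALE ?
			SCHED_CAPACITY_SCALE : boost;	/* cap at max capacity */
	}
	boost >>= 1;					/* no boost pending: decay */
	return boost < IOWAIT_BOOST_MIN ? 0 : boost;
}

int main(void)
{
	unsigned long b = 0;
	int i;

	for (i = 0; i < 5; i++)		/* successive IO wakeups */
		printf("boost = %lu\n", b = iowait_boost(b, 1));
	for (i = 0; i < 4; i++)		/* then updates with no IO */
		printf("boost = %lu\n", b = iowait_boost(b, 0));
	return 0;
}

The boost now always walks from IOWAIT_BOOST_MIN (128) up to SCHED_CAPACITY_SCALE and decays back below the minimum to zero, independently of the CPU's frequency range.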
From d1be6a28b13ce6d1bc42bf9b6a9454c65839225b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 12:48:44 +0000 Subject: asm-generic/mmiowb: Add generic implementation of mmiowb() tracking In preparation for removing all explicit mmiowb() calls from driver code, implement a tracking system in asm-generic based loosely on the PowerPC implementation. This allows architectures with a non-empty mmiowb() definition to have the barrier automatically inserted in spin_unlock() following a critical section containing an I/O write. Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- kernel/Kconfig.locks | 7 +++++++ kernel/locking/spinlock.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..6ba2570eddad 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP + +config ARCH_HAS_MMIOWB + bool + +config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@ #include #include +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are -- cgit v1.2.3 From 60ca1e5a200cd294a12907fa36dece4241db4ab8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 12:59:59 +0000 Subject: mmiowb: Hook up mmiowb helpers to spinlocks and generic I/O accessors Removing explicit calls to mmiowb() from driver code means that we must now call into the generic mmiowb_spin_{lock,unlock}() functions from the core spinlock code. In order to elide barriers following critical sections without any I/O writes, we also hook into the asm-generic I/O routines. Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- kernel/locking/spinlock_debug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); debug_spin_lock_after(lock); } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&lock->raw_lock); - if (ret) + if (ret) { + mmiowb_spin_lock(); debug_spin_lock_after(lock); + } #ifndef CONFIG_SMP /* * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) void do_raw_spin_unlock(raw_spinlock_t *lock) { + mmiowb_spin_unlock(); debug_spin_unlock(lock); arch_spin_unlock(&lock->raw_lock); } -- cgit v1.2.3
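Roughly, the tracking these hooks drive keeps a per-CPU nesting count and a pending flag (a simplified sketch of the asm-generic helpers; the real header differs in detail):

/* Sketch only -- see include/asm-generic/mmiowb.h for the real thing. */
struct mmiowb_state {
	u16	nesting_count;	/* spinlocks currently held on this CPU */
	u16	mmiowb_pending;	/* an I/O write was done under a lock */
};

static inline void mmiowb_set_pending(void)	/* called by I/O accessors */
{
	struct mmiowb_state *ms = __mmiowb_state();

	if (likely(ms->nesting_count))
		ms->mmiowb_pending = ms->nesting_count;
}

static inline void mmiowb_spin_lock(void)
{
	struct mmiowb_state *ms = __mmiowb_state();

	if (ms->nesting_count++ == 0)
		ms->mmiowb_pending = 0;	/* fresh critical section */
}

static inline void mmiowb_spin_unlock(void)
{
	struct mmiowb_state *ms = __mmiowb_state();

	if (unlikely(ms->mmiowb_pending)) {
		ms->mmiowb_pending = 0;
		mmiowb();	/* order the I/O write before the unlock */
	}
	ms->nesting_count--;
}

The net effect is that a critical section with no I/O writes pays nothing, while one with any number of I/O writes pays for a single mmiowb() at unlock time.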
From 12f2639038ef420fe796171ffb810b30d1ac0619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2019 21:46:12 +0200 Subject: tracing: stop making gpio tracing configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpio tracing was made configurable in 4.4-rc1 (commit ddd70280bf0e ("tracing: gpio: Add Kconfig option for enabling/disabling trace events")). Since then it is the only event type that can be compiled conditionally. Given that there is only a little overhead, I don't understand the reasoning, and I was annoyed more than once that gpio events were not available without recompiling. So drop the Kconfig symbol and make gpio events available unconditionally. Signed-off-by: Uwe Kleine-König Signed-off-by: Linus Walleij --- kernel/trace/Kconfig | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8bd1d6d001d7..5d965cef6c77 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,13 +774,6 @@ config TRACE_EVAL_MAP_FILE If unsure, say N -config TRACING_EVENTS_GPIO - bool "Trace gpio events" - depends on GPIOLIB - default y - help - Enable tracing events for gpio subsystem - config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL -- cgit v1.2.3 From 1e144d73f7295f766568c357448a11eb12868e29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 16:07:48 -0400 Subject: tracing: Add trace_array parameter to create_event_filter() Pass in the trace_array that represents the instance the filter being changed is in to create_event_filter(). This allows error messages that arise when writing to the filter to be displayed in the proper instance "error_log" file. Note, for calls to create_filter() (which was also modified to support create_event_filter()) that change filters that do not exist in an instance (for perf, for example), NULL may be passed in, which means that there will not be any message to log for that filter. Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 ++- kernel/trace/trace_events_filter.c | 25 ++++++++++++++----------- kernel/trace/trace_events_trigger.c | 3 ++- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b711edbef7e7..809c5d7f0064 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1553,7 +1553,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct trace_event_call *call, +extern int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 290d42c59101..2b63930cd3e6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -920,7 +920,8 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static void append_filter_err(struct filter_parse_error *pe, +static void append_filter_err(struct trace_array *tr, + struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; @@ -1607,7 +1608,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (err) { filter_disable(file); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(pe, filter); + append_filter_err(tr, pe, filter); } else event_set_filtered_flag(file); @@ -1719,7 +1720,8 @@ static void create_filter_finish(struct filter_parse_error *pe) * information if @set_str is %true and the caller is responsible for * freeing it.
 */ -static int create_filter(struct trace_event_call *call, +static int create_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_string, bool set_str, struct event_filter **filterp) { @@ -1736,17 +1738,18 @@ static int create_filter(struct trace_event_call *call, err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); create_filter_finish(pe); return err; } -int create_event_filter(struct trace_event_call *call, +int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { - return create_filter(call, filter_str, set_str, filterp); + return create_filter(tr, call, filter_str, set_str, filterp); } /** @@ -1773,7 +1776,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); } } create_filter_finish(pe); @@ -1804,7 +1807,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) return 0; } - err = create_filter(call, filter_string, true, &filter); + err = create_filter(file->tr, call, filter_string, true, &filter); /* * Always swap the call filter with the new filter @@ -2060,7 +2063,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - err = create_filter(call, filter_str, false, &filter); + err = create_filter(NULL, call, filter_str, false, &filter); if (err) goto free_filter; @@ -2209,8 +2212,8 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = create_filter(&event_ftrace_test_filter, d->filter, - false, &filter); + err = create_filter(NULL, &event_ftrace_test_filter, + d->filter, false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str, goto out; /* The filter is for the 'trigger' event, not the triggered event */ - ret = create_event_filter(file->event_call, filter_str, false, &filter); + ret = create_event_filter(file->tr, file->event_call, + filter_str, false, &filter); /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. -- cgit v1.2.3 From d0cd871ba0d613e09366e4b6a17946dfcf51db7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 22:30:22 -0400 Subject: tracing: Have histogram code pass around trace_array for error handling Pass the trace_array associated with the trace instance of the histogram around to the relevant functions so that error handling can display the error message in the proper instance.
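The shape of this change, reduced to a toy example (hypothetical names, not the kernel API): error reporting gains an explicit instance handle, and callers without an instance pass NULL, meaning there is nowhere to log the error:

#include <stdio.h>
#include <stddef.h>

struct instance {
	const char *name;	/* stands in for struct trace_array */
};

static void log_err(const struct instance *inst, const char *msg)
{
	if (!inst)		/* no instance (e.g. the perf path): drop it */
		return;
	printf("[%s] error: %s\n", inst->name, msg);
}

static int create_filter(const struct instance *inst, const char *filter)
{
	if (!filter || !filter[0]) {
		log_err(inst, "empty filter string");
		return -1;
	}
	return 0;		/* real parsing would go here */
}

int main(void)
{
	struct instance top = { "top_level" };

	create_filter(&top, "");	/* logged against "top_level" */
	create_filter(NULL, "");	/* no instance: silently dropped */
	return 0;
}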
Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 142 ++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 071c62cacba7..a167e439e9a1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -619,7 +619,7 @@ static void last_cmd_set(struct trace_event_file *file, char *str) snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err(u8 err_type, u8 err_pos) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); } @@ -1756,7 +1756,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1807,7 +1807,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, + errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2042,7 +2043,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; } -static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +static int parse_assignment(struct trace_array *tr, + char *str, struct hist_trigger_attrs *attrs) { int ret = 0; @@ -2098,7 +2100,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err(HIST_ERR_TOO_MANY_VARS, errpos(str)); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2115,7 +2117,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) return ret; } -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) { struct hist_trigger_attrs *attrs; int ret = 0; @@ -2128,7 +2131,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) char *str = strsep(&trigger_str, ":"); if (strchr(str, '=')) { - ret = parse_assignment(str, attrs); + ret = parse_assignment(tr, str, attrs); if (ret) goto free; } else if (strcmp(str, "pause") == 0) @@ -2684,6 +2687,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, char *var_name) { struct hist_field *var_field = NULL, *ref_field = NULL; + struct trace_array *tr = hist_data->event_file->tr; if (!is_var_ref(var_name)) return NULL; @@ -2696,7 +2700,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err(HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2707,6 +2711,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; + struct trace_array *tr = file->tr; modifier = str = kstrdup(field_str, GFP_KERNEL); if (!modifier) @@ -2730,7 +2735,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file 
*file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err(HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2746,7 +2751,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err(HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2808,7 +2813,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); if (!s) { - hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + hist_field = parse_var_ref(hist_data, ref_system, + ref_event, ref_var); if (hist_field) { if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); @@ -2857,7 +2863,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2912,7 +2918,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } -static int check_expr_operands(struct hist_field *operand1, +static int check_expr_operands(struct trace_array *tr, + struct hist_field *operand1, struct hist_field *operand2) { unsigned long operand1_flags = operand1->flags; @@ -2940,7 +2947,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err(HIST_ERR_TIMESTAMP_MISMATCH, 0); + hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2958,7 +2965,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -3003,7 +3010,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(operand1, operand2); + ret = check_expr_operands(file->tr, operand1, operand2); if (ret) goto free; @@ -3196,14 +3203,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err(HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3216,7 +3223,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err(HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3277,7 +3284,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - 
hist_err(HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3289,7 +3296,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err(HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3422,25 +3429,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, { struct hist_field *val = NULL, *var = NULL; unsigned long flags = HIST_FIELD_FL_VAR; + struct trace_array *tr = file->tr; struct field_var *field_var; int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err(HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err(HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3767,19 +3775,20 @@ static int track_data_create(struct hist_trigger_data *hist_data, { struct hist_field *var_field, *ref_field, *track_var = NULL; struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; char *track_data_var_str; int ret = 0; track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err(HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); + hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err(HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); + hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3792,7 +3801,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3800,7 +3809,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3811,7 +3820,8 @@ static int track_data_create(struct hist_trigger_data *hist_data, return ret; } -static int parse_action_params(char *params, struct action_data *data) +static int parse_action_params(struct trace_array *tr, char *params, + struct action_data *data) { char *param, *saved_param; bool first_param = true; @@ -3819,20 +3829,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_PARAMS, 0); + hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - 
hist_err(HIST_ERR_PARAM_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err(HIST_ERR_INVALID_PARAM, errpos(param)); + hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3856,7 +3866,7 @@ static int parse_action_params(char *params, struct action_data *data) return ret; } -static int action_parse(char *str, struct action_data *data, +static int action_parse(struct trace_array *tr, char *str, struct action_data *data, enum handler_id handler) { char *action_name; @@ -3864,14 +3874,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3880,12 +3890,12 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err(HIST_ERR_NO_SAVE_PARAMS, 0); + hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; @@ -3894,7 +3904,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3906,7 +3916,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(params)); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3916,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3931,7 +3941,7 @@ static int action_parse(char *str, struct action_data *data, data->use_trace_keyword = true; if (params) { - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; } @@ -3984,7 +3994,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, goto free; } - ret = action_parse(str, data, handler); + ret = action_parse(hist_data->event_file->tr, str, data, handler); if (ret) goto free; out: @@ -4054,6 +4064,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, struct action_data *data, char *system, char *event, char *var) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field; var++; /* skip '$' */ @@ -4069,7 +4080,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err(HIST_ERR_PARAM_NOT_FOUND, errpos(var)); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4127,6 +4138,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, static int trace_action_create(struct hist_trigger_data *hist_data, struct action_data *data) { + struct trace_array *tr = hist_data->event_file->tr; char *event_name, *param, *system = NULL; struct 
hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; @@ -4144,7 +4156,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err(HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); + hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4205,14 +4217,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err(HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); + hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err(HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); + hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4231,6 +4243,7 @@ static int action_create(struct hist_trigger_data *hist_data, struct action_data *data) { struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; struct track_data *track_data; struct field_var *field_var; unsigned int i; @@ -4258,7 +4271,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err(HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); + hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4271,7 +4284,8 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err(HIST_ERR_FIELD_VAR_CREATE_FAIL, errpos(param)); + hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, + errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4305,18 +4319,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err(HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); + hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err(HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); + hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4332,7 +4346,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - ret = action_parse(str, data, HANDLER_ONMATCH); + ret = action_parse(tr, str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4401,13 +4415,14 @@ static int create_var_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *var_name, char *expr_str) { + struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err(HIST_ERR_DUPLICATE_VAR, errpos(var_name)); + hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4464,8 +4479,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field = NULL; - unsigned long flags = 0; unsigned int key_size; int ret = 0; @@ -4488,7 +4503,7 @@ static int 
create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err(HIST_ERR_INVALID_REF_KEY, errpos(field_str)); + hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4589,6 +4604,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data) static int parse_var_defs(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; char *s, *str, *var_name, *field_str; unsigned int i, j, n_vars = 0; int ret = 0; @@ -4602,13 +4618,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err(HIST_ERR_MALFORMED_ASSIGNMENT, errpos(var_name)); + hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, + errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err(HIST_ERR_TOO_MANY_VARS, errpos(var_name)); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5829,6 +5846,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, { struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + struct trace_array *tr = file->tr; int ret = 0; if (hist_data->attrs->name) { @@ -5836,7 +5854,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err(HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); + hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5857,7 +5875,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err(HIST_ERR_TRIGGER_EEXIST, 0); + hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5865,7 +5883,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err(HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); + hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5890,7 +5908,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err(HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); + hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } @@ -6108,7 +6126,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } - attrs = parse_hist_trigger_attrs(trigger); + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); -- cgit v1.2.3 From 2f754e771b1a6feba670782e82c45555984ac43b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 22:52:21 -0400 Subject: tracing: Have the error logs show up in the proper instances As each instance has its own error_log file, it makes more sense for each instance to show its own errors instead of all error_log files having the same data. Make the errors show up in the error_log file of the instance in which they happen. If no instance trace_array is available, then NULL can be passed in, which will create the error in the top level instance (the one at the top of the tracefs directory).
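The patch below caps the number of error_log entries per instance and then recycles the oldest one; a standalone sketch of that allocation strategy (illustrative names, locking omitted):

#include <stdio.h>
#include <stdlib.h>

#define ERRS_MAX 3	/* stands in for TRACING_LOG_ERRS_MAX */

struct err_entry {
	struct err_entry *next;
	char msg[64];
};

struct instance_log {
	struct err_entry *head, *tail;	/* oldest ... newest */
	unsigned int count;
};

/* Allocate until the cap is hit, then unlink and reuse the oldest entry. */
static struct err_entry *get_entry(struct instance_log *log)
{
	struct err_entry *e;

	if (log->count < ERRS_MAX) {
		e = calloc(1, sizeof(*e));
		if (e)
			log->count++;
		return e;
	}
	e = log->head;
	log->head = e->next;
	return e;
}

static void log_err(struct instance_log *log, const char *msg)
{
	struct err_entry *e = get_entry(log);

	if (!e)
		return;
	snprintf(e->msg, sizeof(e->msg), "%s", msg);
	e->next = NULL;
	if (log->tail)
		log->tail->next = e;
	else
		log->head = e;
	log->tail = e;
}

int main(void)
{
	struct instance_log log = { 0 };
	struct err_entry *e;

	log_err(&log, "error 1");
	log_err(&log, "error 2");
	log_err(&log, "error 3");
	log_err(&log, "error 4");	/* recycles "error 1" */

	for (e = log.head; e; e = e->next)
		printf("%s\n", e->msg);
	return 0;
}

The log stays bounded per instance while always holding the most recent errors.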
Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 55 +++++++++++++++++++++++++------------- kernel/trace/trace.h | 5 +++- kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_events_hist.c | 3 ++- kernel/trace/trace_probe.c | 2 +- 5 files changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7978168f5041..3d55e9daae8c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6897,25 +6897,22 @@ struct tracing_log_err { char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ }; -static LIST_HEAD(tracing_err_log); static DEFINE_MUTEX(tracing_err_log_lock); -static unsigned int n_tracing_err_log_entries; - -struct tracing_log_err *get_tracing_log_err(void) +struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; - if (n_tracing_err_log_entries < TRACING_LOG_ERRS_MAX) { + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) err = ERR_PTR(-ENOMEM); - n_tracing_err_log_entries++; + tr->n_err_log_entries++; return err; } - err = list_first_entry(&tracing_err_log, struct tracing_log_err, list); + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); list_del(&err->list); return err; @@ -6949,6 +6946,7 @@ unsigned int err_pos(char *cmd, const char *str) /** * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) * @loc: A string describing where the error occurred * @cmd: The tracing command that caused the error * @errs: The array of loc-specific static error strings @@ -6973,13 +6971,17 @@ unsigned int err_pos(char *cmd, const char *str) * existing callers for examples of how static strings are typically * defined for use with tracing_log_err(). 
*/ -void tracing_log_err(const char *loc, const char *cmd, +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, const char **errs, u8 type, u8 pos) { struct tracing_log_err *err; + if (!tr) + tr = &global_trace; + mutex_lock(&tracing_err_log_lock); - err = get_tracing_log_err(); + err = get_tracing_log_err(tr); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; @@ -6993,34 +6995,38 @@ void tracing_log_err(const char *loc, const char *cmd, err->info.pos = pos; err->info.ts = local_clock(); - list_add_tail(&err->list, &tracing_err_log); + list_add_tail(&err->list, &tr->err_log); mutex_unlock(&tracing_err_log_lock); } -static void clear_tracing_err_log(void) +static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; mutex_lock(&tracing_err_log_lock); - list_for_each_entry_safe(err, next, &tracing_err_log, list) { + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); kfree(err); } - n_tracing_err_log_entries = 0; + tr->n_err_log_entries = 0; mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; + mutex_lock(&tracing_err_log_lock); - return seq_list_start(&tracing_err_log, *pos); + return seq_list_start(&tr->err_log, *pos); } static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &tracing_err_log, pos); + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); } static void tracing_err_log_seq_stop(struct seq_file *m, void *v) @@ -7067,15 +7073,25 @@ static const struct seq_operations tracing_err_log_seq_ops = { static int tracing_err_log_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - clear_tracing_err_log(); + clear_tracing_err_log(tr); - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &tracing_err_log_seq_ops); - + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + trace_array_put(tr); + } + } return ret; } @@ -7091,6 +7107,7 @@ static const struct file_operations tracing_err_log_fops = { .write = tracing_err_log_write, .read = seq_read, .llseek = seq_lseek, + .release = tracing_release_generic_tr, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) @@ -8293,6 +8310,7 @@ struct trace_array *trace_array_create(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9087,6 +9105,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 809c5d7f0064..da00a3d508c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -293,11 +293,13 @@ struct trace_array { int nr_topts; bool clear_trace; int buffer_percent; + unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char 
trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; @@ -1886,7 +1888,8 @@ extern ssize_t trace_parse_run_command(struct file *file, int (*createfn)(int, char**)); extern unsigned int err_pos(char *cmd, const char *str); -extern void tracing_log_err(const char *loc, const char *cmd, +extern void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, const char **errs, u8 type, u8 pos); /* diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2b63930cd3e6..180ecb390baa 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -949,12 +949,12 @@ static void append_filter_err(struct trace_array *tr, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); - tracing_log_err("event filter parse error", + tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); - tracing_log_err("event filter parse error", + tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, FILT_ERR_ERRNO, 0); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a167e439e9a1..a1136e043f17 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -621,7 +621,8 @@ static void last_cmd_set(struct trace_event_file *file, char *str) static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } static void hist_err_clear(void) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e11f98c49d72..4cc2d467d34c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -186,7 +186,7 @@ void __trace_probe_log_err(int offset, int err_type) } *(p - 1) = '\0'; - tracing_log_err(trace_probe_log.subsystem, command, + tracing_log_err(NULL, trace_probe_log.subsystem, command, trace_probe_err_text, err_type, pos + offset); kfree(command); -- cgit v1.2.3 From a8d655792a32312f6715ac789b860fee50168106 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:25 -0500 Subject: tracing: Add error_log to README Add brief blurb about error_log to the 'Important files' section. 
Link: http://lkml.kernel.org/r/c81e60f9aded495081231a32d2d1023c4d043a7a.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3d55e9daae8c..2bc18de7f0dc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4702,6 +4702,7 @@ static const char readme_msg[] = " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t-change the clock used to order events\n" -- cgit v1.2.3 From 4f5fbd78a7b40bab538ae0d316363530da751e42 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 26 Mar 2019 20:13:11 +0800 Subject: rcu: validate arguments for rcu tracepoints When CONFIG_RCU_TRACE is not set, all these tracepoints are defined as do-nothing macros. We'd better make them inline functions that take proper arguments. As RCU_TRACE() is likewise defined as a do-nothing macro when CONFIG_RCU_TRACE is not set, we can clean it up as well. Link: http://lkml.kernel.org/r/1553602391-11926-4-git-send-email-laoar.shao@gmail.com Reviewed-by: Paul E. McKenney Signed-off-by: Yafang Shao Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/rcu.h | 9 ++------- kernel/rcu/tree.c | 8 ++++---- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..442ace406ac9 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -11,11 +11,6 @@ #define __LINUX_RCU_H #include -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) + trace_rcu_invoke_kfree_callback(rn, head, offset); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head);) + trace_rcu_invoke_callback(rn, head); f = head->func; WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..906563a1cdea 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2352,14 +2352,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp) */ int rcutree_dying_cpu(unsigned int cpu) { - RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) - RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) + bool blkd; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; - RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; -- cgit v1.2.3
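The difference the changelog above is after, in reduced hypothetical form (trace_foo() is not a real kernel API): with the config off, an empty static inline stub still type-checks its arguments, while a do-nothing macro accepts anything until the config is turned on:

/* Hypothetical example, not taken from the kernel tree. */
#ifdef CONFIG_FOO_TRACE
void trace_foo(const char *name, unsigned long seq);	/* real tracepoint */
#else
static inline void trace_foo(const char *name, unsigned long seq) { }
#endif

/* With "#define trace_foo(name, seq)" instead, a call such as
 * trace_foo(42, "backwards") would compile cleanly whenever
 * CONFIG_FOO_TRACE is disabled and only break once it is enabled. */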
TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; -- cgit v1.2.3 From c13edf8106f6ad1edb9b7e011351fbaf83ceb992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Leger?= Date: Wed, 27 Mar 2019 14:06:27 +0100 Subject: dma: select GENERIC_ALLOCATOR for DMA_REMAP When DMA_REMAP is enabled, code in remap.c needs generic allocator. It currently worked since few architectures uses it (arm64, csky) and they both select GENERIC_ALLOCATOR. Select it when using DMA_REMAP to have correct dependencies. Signed-off-by: Clement Leger Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a06ba3013b3b..52b704e2b97a 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -57,6 +57,7 @@ config SWIOTLB config DMA_REMAP depends on MMU + select GENERIC_ALLOCATOR bool config DMA_DIRECT_REMAP -- cgit v1.2.3 From d7e02a931235de0779d44c6f8d211df0eca304b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:45:21 +0100 Subject: dma-mapping: remove leftover NULL device support Most dma_map_ops implementations already had some issues with a NULL device, or did simply crash if one was fed to them. Now that we have cleaned up all the obvious offenders we can stop to pretend we support this mode. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index fcdb23e8d2fc..2c2772e9702a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -311,7 +311,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - (!dev || dma_capable(dev, dma_addr, size)); + dma_capable(dev, dma_addr, size); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, -- cgit v1.2.3 From e43e2657fe77a37b13643e2469670ecdb0ba5e10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Dec 2018 14:32:02 +0100 Subject: x86/dma: Remove the x86_dma_fallback_dev hack Now that we removed support for the NULL device argument in the DMA API, there is no need to cater for that in the x86 code. Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..685a53f2a793 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,10 +238,6 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) -- cgit v1.2.3 From 24acfb71822566e4d469b4992a7b3b9f873e0083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:47 +0100 Subject: workqueue: Use normal rcu There is no need for sched_rcu. 
From 24acfb71822566e4d469b4992a7b3b9f873e0083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:47 +0100 Subject: workqueue: Use normal rcu There is no need for sched_rcu. The undocumented reason why sched_rcu is used is to avoid a few explicit rcu_read_lock()/unlock() pairs, exploiting the fact that sched-RCU reader side critical sections are also protected by preempt or irq disabled regions. Replace rcu_read_lock_sched() with rcu_read_lock() and acquire the RCU lock where it is not yet explicitly acquired. Replace local_irq_disable() with rcu_read_lock(). Update asserts. Signed-off-by: Thomas Gleixner [bigeasy: mangle changelog a little] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 93 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 21721faa923c..37a32884986b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -127,16 +127,16 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or - * sched-RCU for reads. + * RCU for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -183,7 +183,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -212,7 +212,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; @@ -266,8 +266,8 @@ struct workqueue_struct { char name[WQ_NAME_LEN]; /* I: workqueue name */ /* - * Destruction of workqueue_struct is sched-RCU protected to allow - * walking the workqueues list without grabbing wq_pool_mutex. + * Destruction of workqueue_struct is RCU protected to allow walking + * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq.
*/ struct rcu_head rcu; @@ -359,20 +359,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -384,7 +384,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -416,7 +416,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -552,7 +552,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. @@ -696,8 +696,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -1133,7 +1133,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1261,6 +1261,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 
@@ -1299,10 +1300,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1416,6 +1419,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1472,10 +1476,8 @@ retry: /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1493,7 +1495,9 @@ retry: insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -2975,14 +2979,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3014,10 +3018,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -3504,7 +3509,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). 
@@ -3558,7 +3563,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ + /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } @@ -4472,7 +4477,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4483,7 +4489,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4509,15 +4516,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); + rcu_read_unlock(); return ret; } @@ -4701,7 +4708,7 @@ void show_workqueue_state(void) unsigned long flags; int pi; - rcu_read_lock_sched(); + rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); @@ -4766,7 +4773,7 @@ void show_workqueue_state(void) touch_nmi_watchdog(); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -5153,16 +5160,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. 
*/ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); @@ -5357,7 +5364,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -5365,7 +5373,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } -- cgit v1.2.3 From 3b15d09f7e6db44065aaba5fd16dc7420035c5ad Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 28 Feb 2019 13:13:26 +0800 Subject: time: Introduce jiffies64_to_msecs() There is a similar helper in net/netfilter/nf_tables_api.c; this may become a common requirement someday, so move it to time.c. Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Acked-by: John Stultz Signed-off-by: Pablo Neira Ayuso --- kernel/time/time.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..9e3f79d4f5a8 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -783,6 +783,16 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#else + return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * -- cgit v1.2.3 From 699c1868a743f530081f429058616a2dd5d8a4b2 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 8 Apr 2019 12:50:57 -0400 Subject: audit: purge unnecessary list_empty calls The original conditions that led to the use of list_empty() to optimize list_for_each_entry_rcu() in the auditfilter.c and auditsc.c code have been removed without removing the list_empty() call, and this code pattern has since been copied several times. Remove the unnecessary list_empty() calls.
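The redundant pattern being removed looks roughly like this (an illustrative sketch, not code quoted from the tree; do_something() is a made-up stand-in):

    rcu_read_lock();
    if (!list_empty(list))                  /* unnecessary guard */
        list_for_each_entry_rcu(e, list, list)
            do_something(e);                /* hypothetical per-entry work */
    rcu_read_unlock();

list_for_each_entry_rcu() simply iterates zero times over an empty list, so the guard buys nothing.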
Please see upstream github issue https://github.com/linux-audit/audit-kernel/issues/112 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 2 -- kernel/auditsc.c | 64 ++++++++++++++++++++++------------------------------ 2 files changed, 27 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..2c3c2f349b23 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1315,8 +1315,6 @@ int audit_filter(int msgtype, unsigned int listtype) int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[listtype])) - goto unlock_and_return; list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 98a98e6dca05..51a2ceb3a1ca 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -771,15 +771,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_DISABLED; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return state; - } + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, ctx->major) && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return state; } } rcu_read_unlock(); @@ -798,9 +796,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (list_empty(list)) - return 0; - list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { @@ -808,7 +803,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, return 1; } } - return 0; } @@ -1945,18 +1939,16 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(inode->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -2065,18 +2057,16 @@ void __audit_inode_child(struct inode *parent, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(parent->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } -- cgit v1.2.3 From d75f773c86a2b8b7278e2c33343b46a4024bc002 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 25 Mar 2019 
21:32:28 +0200 Subject: treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively %pF and %pf are functionally equivalent to %pS and %ps conversion specifiers. The former are deprecated, therefore switch the current users to use the preferred variant. The changes have been produced by the following command: git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done And verifying the result. Link: http://lkml.kernel.org/r/20190325193229.23390-1-sakari.ailus@linux.intel.com Cc: Andy Shevchenko Cc: linux-arm-kernel@lists.infradead.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: xen-devel@lists.xenproject.org Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvdimm@lists.01.org Cc: linux-pci@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-btrfs@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-mm@kvack.org Cc: ceph-devel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Sakari Ailus Acked-by: David Sterba (for btrfs) Acked-by: Mike Rapoport (for mm/memblock.c) Acked-by: Bjorn Helgaas (for drivers/pci) Acked-by: Rafael J. Wysocki Signed-off-by: Petr Mladek --- kernel/async.c | 4 ++-- kernel/events/uprobes.c | 2 +- kernel/fail_function.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 2 +- kernel/irq/spurious.c | 4 ++-- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 12 ++++++------ 12 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index affa830a198c..48abc0f18eae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- 
a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..ec43ab2fdfda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -778,7 +778,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..8eee921b384d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2870,7 +2870,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). 
*/ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..1002cf61700a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..6502c3ed317e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1328,7 +1328,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7abbeed13421..8ea9e4fb8cc6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2277,7 +2277,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2596,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -4582,7 +4582,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4607,7 +4607,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4639,7 +4639,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? 
"," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); -- cgit v1.2.3 From 6f9b83ac877fb5558d76b9f78590f3afd1bdf421 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 27 Mar 2019 15:35:47 +0100 Subject: cpuidle: Export the next timer expiration for CPUs To be able to predict the sleep duration for a CPU entering idle, it is essential to know the expiration time of the next timer. Both the teo and the menu cpuidle governors already use this information for CPU idle state selection. Moving forward, a similar prediction needs to be made for a group of idle CPUs rather than for a single one and the following changes implement a new genpd governor for that purpose. In order to support that feature, add a new function called tick_nohz_get_next_hrtimer() that will return the next hrtimer expiration time of a given CPU to be invoked after deciding whether or not to stop the scheduler tick on that CPU. Make the cpuidle core call tick_nohz_get_next_hrtimer() right before invoking the ->enter() callback provided by the cpuidle driver for the given state and store its return value in the per-CPU struct cpuidle_device, so as to make it available to code outside of cpuidle. Note that at the point when cpuidle calls tick_nohz_get_next_hrtimer(), the governor's ->select() callback has already returned and indicated whether or not the tick should be stopped, so in fact the value returned by tick_nohz_get_next_hrtimer() always is the next hrtimer expiration time for the given CPU, possibly including the tick (if it hasn't been stopped). Co-developed-by: Lina Iyer Co-developed-by: Daniel Lezcano Acked-by: Daniel Lezcano Signed-off-by: Ulf Hansson [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/time/tick-sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..8d18e03124ff 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1022,6 +1022,18 @@ bool tick_nohz_idle_got_tick(void) return false; } +/** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whatever that expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ + return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped -- cgit v1.2.3 From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for ldimm64 instruction that indicates that inside the first part of the double insns's imm field is a file descriptor which the verifier then replaces as a full 64bit address of the map into both imm parts. 
For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insns's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is the array map with a single entry. It is possible to support more than just a single map element by reusing both 16bit off fields of the insns as a map index, so a full array map lookup could be expressed that way. It hasn't been implemented here due to lack of a concrete use case, but could easily be done in the future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not distinguish an offset-0 load of the map pointer from a load of the map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding to an off-by-one scheme to distinguish a regular map pointer from a map value pointer would add unnecessary complexity and raise the barrier for debuggability, making it less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations, since the maximum possible value size is in the u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call, which needs to prepare registers according to the calling convention etc., without needing the extra NULL test, and without having to add the offset to the value base pointer in an additional instruction. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user-passed offset in the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are treated as typical map value handling, without anything extra needed from the verification side. The two map operations for direct value access have been added to the array map for now. In the future other types could be supported as well, depending on the use case. The main use case for this commit is to allow BPF loader support for global variables that reside in .data/.rodata/.bss sections, such that we can directly load their address with minimal additional infrastructure required. Loader support has been added in subsequent commits for the libbpf library.
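As an illustrative sketch of the described encoding (loader-side, not part of this patch; the register choice and offset are made up):

    /* ldimm64 pseudo insn pair that loads the address of the value
     * bytes at offset 16 of a (single entry) array map into r1. The
     * verifier rewrites the two imm halves into the final 64 bit
     * address as described above.
     */
    struct bpf_insn insn[2] = {
        { .code    = BPF_LD | BPF_DW | BPF_IMM,
          .dst_reg = BPF_REG_1,
          .src_reg = BPF_PSEUDO_MAP_VALUE,
          .imm     = map_fd },        /* first imm half: map fd */
        { .imm     = 16 },            /* second imm half: value offset */
    };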
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 32 +++++++++++++++++++ kernel/bpf/core.c | 3 +- kernel/bpf/disasm.c | 5 +-- kernel/bpf/syscall.c | 28 ++++++++++++----- kernel/bpf/verifier.c | 86 +++++++++++++++++++++++++++++++++++++++------------ 5 files changed, 124 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..1a6e9861d554 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2966cb368bf4..ace8c22c8b0e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index de73f55e42fd..d9ce383c0f9c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. 
*/ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d65e56594db..828518bb947b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + return NULL; } @@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48718e1da16d..6ab7a23fc924 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -6803,8 +6808,10 @@ static int 
replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || - insn[1].imm != 0) { - verbose(env, "unrecognized bpf_ld_imm64 insn\n"); + /* In final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. + */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && -- cgit v1.2.3 From be70bcd53de66e86f2726e576307cbdaebd3b1a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:04 +0200 Subject: bpf: do not retain flags that are not tied to map lifetime Both BPF_F_WRONLY / BPF_F_RDONLY flags are tied to the map file descriptor, but not to the map object itself! Meaning, at map creation time BPF_F_RDONLY can be set to make the map read-only from syscall side, but this holds only for the returned fd, so any other fd either retrieved via bpf file system or via map id for the very same underlying map object can have read-write access instead. Given that, keeping the two flags around in the map_flags attribute and exposing them to user space upon map dump is misleading and may lead to false conclusions. Since these two flags are not tied to the map object lets also not store them as map property. 
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 828518bb947b..56b4b0e08b3b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,13 +166,25 @@ void bpf_map_area_free(void *area) kvfree(area); } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ + /* Some map creation flags are not tied to the map object but + * rather to the map fd instead, so they have no meaning upon + * map object inspection since multiple file descriptors with + * different (access) properties can exist here. Thus, given + * this has zero meaning for the map itself, lets clear these + * from here. + */ + return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} + void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; + map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); } -- cgit v1.2.3 From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags, BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG, in order to allow for read-only or write-only BPF maps from the BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but these only apply to the system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with the map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allow for the exact opposite, such that the verifier rejects program loads if a write into a read-only map or a read from a write-only map is detected. For the read-only map case, helpers that would alter the map state, such as map deletion or update, are also forbidden for such programs. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could follow in the future depending on use case. The main use case here is to forbid writes into .rodata map values from the verifier side.
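A minimal loader-side sketch of creating such a map (illustrative only, assuming the usual linux/bpf.h and syscall headers; error handling omitted):

    union bpf_attr attr = {
        .map_type    = BPF_MAP_TYPE_ARRAY,
        .key_size    = 4,
        .value_size  = 64,
        .max_entries = 1,
        .map_flags   = BPF_F_RDONLY_PROG,   /* programs may only read */
    };
    /* syscall-side fds keep their normal read/write access */
    int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));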
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 6 +++++- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 6 +++--- kernel/bpf/lpm_trie.c | 3 ++- kernel/bpf/queue_stack_maps.c | 6 +++--- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++++++++++-- 7 files changed, 62 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1a6e9861d554..217b10bd9f48 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -472,6 +473,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fed15cf94dca..192d32e77db3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. 
*/ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 93a5cbbde421..e61630c2e50b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -538,7 +538,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56b4b0e08b3b..0c9276b54c88 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -501,6 +501,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->spin_lock_off = btf_find_spin_lock(btf, value_type); if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ab7a23fc924..b747434df89c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1439,6 +1439,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, 
int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -2024,7 +2046,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -2327,6 +2351,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -3059,6 +3087,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -3069,11 +3098,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. + */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); -- cgit v1.2.3 From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows to "freeze" the map globally as read-only / immutable from syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in case of locked map. Main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning BPF loader first allocates map, sets up map value by copying .rodata section into it and once complete, it calls BPF_MAP_FREEZE on the map fd to prevent further modifications. 
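Sketched from the loader's perspective, that flow could look roughly like this (illustrative only; handle_error() is a made-up stand-in):

    union bpf_attr attr = { .map_fd = map_fd };

    /* ... BPF_MAP_UPDATE_ELEM calls copying the .rodata contents ... */

    if (syscall(__NR_bpf, BPF_MAP_FREEZE, &attr, sizeof(attr)) < 0)
        /* e.g. EBUSY if already frozen, EPERM without CAP_SYS_ADMIN */
        handle_error();
    /* from here on, syscall-side writes to the map fail with EPERM */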
Right now, BPF_MAP_FREEZE only takes the map fd as argument while the remaining bpf_attr members are required to be zero. I didn't add write-only locking here as a counterpart since I don't have a concrete use-case for it on my side, and I think it probably makes more sense to wait until there actually is one. In that case bpf_attr can be extended as usual with a flag field and/or others, where flag 0 means that we lock the map read-only; hence this doesn't prevent adding further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for a couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exactly one write before the map is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for the map to eventually be made immutable. While this can be made race-free, this approach feels less clean and, in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and the caller has the guarantee that the map is immutable from the syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close to BPF map spin locks (which already have the BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c9276b54c88..b3ce516e5a20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -355,6 +355,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side.
+ */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -376,14 +388,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -727,8 +741,7 @@ static int map_lookup_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -857,8 +870,7 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -969,8 +981,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1021,8 +1032,7 @@ static int map_get_next_key(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1089,8 +1099,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1132,6 +1141,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2735,6 +2774,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; -- cgit v1.2.3 From 3e0ddc4f3ff1436970e96e76f3df3c3b5f5173b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:07 +0200 Subject: bpf: allow . char as part of the object name Trivial addition to allow '.' aside from '_' as "special" characters in the object name. Used to allow for substrings in maps from loader side such as ".bss", ".data", ".rodata", but could also be useful for other purposes. 
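For instance, a loader preparing a BPF_MAP_CREATE attribute can now carry an ELF section name straight into the map name (illustrative sketch; attr is assumed to be a union bpf_attr):

    /* ".rodata" now passes bpf_obj_name_cpy(); "ro-data" still fails */
    memcpy(attr.map_name, ".rodata", sizeof(".rodata"));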
Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b3ce516e5a20..198c9680bf0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -474,10 +474,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src) const char *end = src + BPF_OBJ_NAME_LEN; memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ + /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { - if (!isalnum(*src) && *src != '_') + if (!isalnum(*src) && + *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } -- cgit v1.2.3 From 1dc92851849cc2235a1efef8f8d5a9255efc5f13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:09 +0200 Subject: bpf: kernel side support for BTF Var and DataSec This work adds kernel-side verification, logging and seq_show dumping of the BTF Var and DataSec kinds, which are emitted by the latest LLVM. The following constraints apply:

BTF Var must have:
- Its kind_flag is 0
- Its vlen is 0
- Must point to a valid type
- Type must not resolve to a forward type
- Size of underlying type must be > 0
- Must have a valid name
- Can only be a source type, not sink or intermediate one
- Name may include dots (e.g. in case of static variables inside functions)
- Cannot be a member of a struct/union
- Linkage can so far only be static or global/allocated

BTF DataSec must have:
- Its kind_flag is 0
- Its vlen cannot be 0
- Its size cannot be 0
- Must have a valid name
- Can only be a source type, not sink or intermediate one
- Name may include dots (e.g. to represent .bss, .data, .rodata etc)
- Cannot be a member of a struct/union
- Inner btf_var_secinfo array with {type,offset,size} triple must be sorted by offset in ascending order
- Type must always point to BTF Var
- BTF resolved size of Var must be <= size provided by triple
- DataSec size must be >= sum of triple sizes (thus holes are allowed)

btf_var_resolve(), btf_ptr_resolve() and btf_modifier_resolve() are on a high level quite similar, but each comes with slight, subtle differences. They could potentially be refactored a bit in the future, which hasn't been done here to ease review.
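To make the two kinds concrete, a sketch of what this looks like for a small object (type ids, sizes and the dump rendering are made up for illustration):

    /* foo.c */
    static int counter;             /* placed in .bss    */
    const int max_entries = 128;    /* placed in .rodata */

    /* resulting BTF, roughly:
     *   [4] VAR 'counter' type_id=1 (int), linkage=static
     *   [5] VAR 'max_entries' type_id=3 (const int), linkage=global-alloc
     *   [6] DATASEC '.bss' size=4 vlen=1
     *         type_id=4 offset=0 size=4
     *   [7] DATASEC '.rodata' size=4 vlen=1
     *         type_id=5 offset=0 size=4
     */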
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 397 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd3921b1514b..0cecf6bab61b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -185,6 +185,16 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, struct_type, member) \ + for (i = 0, member = btf_type_var_secinfo(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", }; struct btf_kind_operations { @@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ + return btf_type_is_var(t) || + btf_type_is_datasec(t); +} + /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same @@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t) static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || - btf_type_is_ptr(t) || - btf_type_is_struct(t) || - btf_type_is_array(t); + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t) || + btf_type_is_var(t) || + btf_type_is_datasec(t); } /* t->size can be used */ @@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_DATASEC: return true; } @@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t) return (const struct btf_enum *)(t + 1); } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ + return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. 
- */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ + if ((first ? !isalpha(c) : + !isalnum(c)) && + c != '_' && + ((c == '.' && !dot_ok) || + c != '.')) + return false; + return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ const char *src = &btf->strings[offset]; const char *src_limit; - if (!isalpha(*src) && *src != '_') + if (!__btf_name_char_ok(*src, true, dot_ok)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!isalnum(*src) && *src != '_') + if (!__btf_name_char_ok(*src, false, dot_ok)) return false; src++; } @@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, true); +} + static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, + const struct btf_type *datasec_type, + const struct btf_var_secinfo *vsi, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + if (env->phase != CHECK_META) + btf_verifier_log_type(env, datasec_type, NULL); + + __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", + vsi->type, vsi->offset, vsi->size); + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { @@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) return NULL; size = btf->resolved_sizes[size_type_id]; @@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return 0; } +static int btf_var_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type || btf_type_is_resolve_source_only(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + if 
(btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + /* We must resolve to something concrete at this point, no + * forward types or similar that would resolve to size of + * zero is allowed. + */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf, btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { + if (btf_type_is_resolve_source_only(index_type) || + btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { + if (btf_type_is_resolve_source_only(elem_type) || + btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_nosize_or_null(member_type)) { + if (btf_type_is_resolve_source_only(member_type) || + btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = { .seq_show = btf_df_seq_show, }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var *var; + u32 meta_needed = sizeof(*var); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if 
(!t->name_off || + !__btf_name_valid(env->btf, t->name_off, true)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + /* A var cannot be in type void */ + if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + var = btf_type_var(t); + if (var->linkage != BTF_VAR_STATIC && + var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + btf_verifier_log_type(env, t, "Linkage not supported"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ + const struct btf_var *var = btf_type_var(t); + + btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { + .check_meta = btf_var_check_meta, + .resolve = btf_var_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_var_log, + .seq_show = btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var_secinfo *vsi; + u64 last_vsi_end_off = 0, sum = 0; + u32 i, meta_needed; + + meta_needed = btf_type_vlen(t) * sizeof(*vsi); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (!btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen == 0"); + return -EINVAL; + } + + if (!t->size) { + btf_verifier_log_type(env, t, "size == 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !btf_name_valid_section(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_vsi(i, t, vsi) { + /* A var cannot be in type void */ + if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid type_id"); + return -EINVAL; + } + + if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset"); + return -EINVAL; + } + + if (!vsi->size || vsi->size > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid size"); + return -EINVAL; + } + + last_vsi_end_off = vsi->offset + vsi->size; + if (last_vsi_end_off > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset+size"); + return -EINVAL; + } + + btf_verifier_log_vsi(env, t, vsi, NULL); + sum += vsi->size; + } + + if (t->size < sum) { + btf_verifier_log_type(env, t, "Invalid btf_info size"); + return -EINVAL; + } + + return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_var_secinfo *vsi; + struct btf *btf = env->btf; + u16 i; + + for_each_vsi_from(i, v->next_member, v->t, vsi) { + u32 var_type_id = vsi->type, type_id, type_size = 0; + const struct btf_type *var_type = btf_type_by_id(env->btf, + var_type_id); + if (!var_type || !btf_type_is_var(var_type)) { + btf_verifier_log_vsi(env, v->t, vsi, + "Not a VAR kind member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, var_type) && + !env_type_is_resolved(env, var_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, var_type, var_type_id); + } + + type_id = var_type->type; + if (!btf_type_id_size(btf, &type_id, 
&type_size)) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); + return -EINVAL; + } + + if (vsi->size < type_size) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); + return -EINVAL; + } + } + + env_stack_pop_resolved(env, 0, 0); + return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + u32 i; + + seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + for_each_vsi(i, t, vsi) { + var = btf_type_by_id(btf, vsi->type); + if (i) + seq_puts(m, ","); + btf_type_ops(var)->seq_show(btf, var, vsi->type, + data + vsi->offset, bits_offset, m); + } + seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { + .check_meta = btf_datasec_check_meta, + .resolve = btf_datasec_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_datasec_log, + .seq_show = btf_datasec_seq_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, + [BTF_KIND_VAR] = &var_ops, + [BTF_KIND_DATASEC] = &datasec_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, if (!env_type_is_resolved(env, type_id)) return false; - if (btf_type_is_struct(t)) + if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + !btf->resolved_sizes[type_id]; - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || + btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); + return t && + !btf_type_is_modifier(t) && + !btf_type_is_var(t) && + !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { -- cgit v1.2.3 From 2824ecb7010f6a20e9a4140512b798469ab066cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:10 +0200 Subject: bpf: allow for key-less BTF in array map Given we'll be reusing BPF array maps for global data/bss/rodata sections, we need a way to associate BTF DataSec type as its map value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR() macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR") to get initial map to type association going. While more use cases for it are discouraged, this also won't work for global data since the use of array map is a BPF loader detail and therefore unknown at compilation time. For array maps with just a single entry we make an exception in terms of BTF in that key type is declared optional if value type is of DataSec type. The latter LLVM is guaranteed to emit and it also aligns with how we regard global data maps as just a plain buffer area reusing existing map facilities for allowing things like introspection with existing tools. 
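For illustration only, a loader could then associate the BTF with such a map roughly as below; btf_fd, datasec_size and datasec_type_id are placeholders for values obtained when loading the BTF object:

    /* Minimal sketch of a key-less BTF association: a single-entry
     * BPF_MAP_TYPE_ARRAY whose value type is a DATASEC, with
     * btf_key_type_id left at 0 as this patch now permits. */
    union bpf_attr attr = {
            .map_type          = BPF_MAP_TYPE_ARRAY,
            .key_size          = 4,               /* u32 index         */
            .value_size        = datasec_size,    /* section size      */
            .max_entries       = 1,
            .btf_fd            = btf_fd,          /* loaded BTF object */
            .btf_key_type_id   = 0,               /* key-less BTF      */
            .btf_value_type_id = datasec_type_id, /* BTF_KIND_DATASEC  */
    };
    /* ...followed by the usual bpf(BPF_MAP_CREATE, &attr, sizeof(attr)). */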
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 15 ++++++++++++++- kernel/bpf/btf.c | 2 +- kernel/bpf/syscall.c | 15 +++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 217b10bd9f48..584636c9e2eb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -391,7 +391,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -428,6 +429,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0cecf6bab61b..cad09858a5f2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -326,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 198c9680bf0d..438199e2eca4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -504,9 +504,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 key_size, value_size; int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; + /* Some maps allow key to be unspecified. */ + if (btf_key_id) { + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + } else { + key_type = btf_type_by_id(btf, 0); + if (!map->ops->map_check_btf) + return -EINVAL; + } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) @@ -573,7 +580,7 @@ static int map_create(union bpf_attr *attr) if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + if (!attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } -- cgit v1.2.3 From d8743230c9f4e92f370ecd2a90c680ddcede6ae5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 9 Apr 2019 18:35:46 +0100 Subject: sched/topology: Fix build_sched_groups() comment The comment was introduced (pre 2.6.12) by: 8a7a2318dc07 ("[PATCH] sched: consolidate sched domains") and referred to sched_group->cpu_power. This was folded into sched_group->sched_group_power in commit 9c3f75cbd144 ("sched: Break out cpu_power from the sched_group structure") The comment was then updated in: ced549fa5fc1 ("sched: Remove remaining dubious usage of "power"") but should have replaced "sg->cpu_capacity" with "sg->sched_group_capacity". Do that now. 
Signed-off-by: Valentin Schneider Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Cc: qais.yousef@arm.com Link: http://lkml.kernel.org/r/20190409173546.4747-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64bec54ded3e..90e1a870fb0d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1087,8 +1087,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) /* * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc. * * Assumes the sched_domain tree is fully constructed */ -- cgit v1.2.3 From 67d4f6ff2fb69e02bd6365a91ca3939b7a14deac Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 9 Apr 2019 18:35:45 +0100 Subject: sched/topology: Skip duplicate group rewrites in build_sched_groups() While staring at build_sched_domains(), I realized that get_group() does several duplicate (thus useless) writes. If you take the Arm Juno r0 (LITTLEs = [0, 3, 4, 5], bigs = [1, 2]), the sched_group build flow would look like this: ('MC[cpu]->sg' means 'per_cpu_ptr(&tl->data->sg, cpu)' with 'tl == MC') build_sched_groups(MC[CPU0]->sd, CPU0) get_group(0) -> MC[CPU0]->sg get_group(3) -> MC[CPU3]->sg get_group(4) -> MC[CPU4]->sg get_group(5) -> MC[CPU5]->sg build_sched_groups(DIE[CPU0]->sd, CPU0) get_group(0) -> DIE[CPU0]->sg get_group(1) -> DIE[CPU1]->sg <=================+ | build_sched_groups(MC[CPU1]->sd, CPU1) | get_group(1) -> MC[CPU1]->sg | get_group(2) -> MC[CPU2]->sg | | build_sched_groups(DIE[CPU1]->sd, CPU1) ^ get_group(1) -> DIE[CPU1]->sg } We've set these two up here! get_group(3) -> DIE[CPU0]->sg } From this point on, we will only use sched_groups that have been previously visited & initialized. The only new operation will be which group pointer we assign to sd->groups. On the Juno r0 we get 32 get_group() calls, every single one of them writing to a sched_group->cpumask. However, all of the data structures we need are set up after 8 visits (see above). Return early from get_group() if we've already visited (and thus initialized) the sched_group we're looking at. Overlapping domains are not affected as they do not use build_sched_groups(). Tested on a Juno and a 2 * (Xeon E5-2690) system. ( FWIW I initially checked the refs for both sg && sg->sgc, but figured if they weren't both 0 or > 1 then something must have gone wrong, so I threw in a WARN_ON(). ) No change in functionality intended.
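The gist of the change, as a stand-alone sketch of the visited-detection pattern (simplified; the struct and its init step are stand-ins for the real sched_group handling):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct group_sketch {
            atomic_int ref;         /* refcount doubles as "visited" marker */
            /* ... span, capacity, ... */
    };

    static struct group_sketch *get_group_sketch(struct group_sketch *g)
    {
            /* atomic_fetch_add() returns the old value, so old > 0 means
             * an earlier call already initialized this group. */
            bool already_visited = atomic_fetch_add(&g->ref, 1) > 0;

            if (already_visited)
                    return g;       /* skip the duplicate rewrites */

            /* first visit: one-time cpumask/capacity setup goes here */
            return g;
    }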
Signed-off-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 90e1a870fb0d..c65b31e9458b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; struct sched_group *sg; + bool already_visited; if (child) cpu = cpumask_first(sched_domain_span(child)); @@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg = *per_cpu_ptr(sdd->sg, cpu); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); - /* For claim_allocations: */ - atomic_inc(&sg->ref); - atomic_inc(&sg->sgc->ref); + /* Increase refcounts for claim_allocations: */ + already_visited = atomic_inc_return(&sg->ref) > 1; + /* sgc visits should follow a similar trend as sg */ + WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + + /* If we have already visited that group, it's already initialized. */ + if (already_visited) + return sg; if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); -- cgit v1.2.3 From eecec78f777742903ec9167490c625661284155d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:10 -0400 Subject: locking/rwsem: Relocate rwsem_down_read_failed() The rwsem_down_read_failed*() functions were relocated from above the optimistic spinning section to below that section. This enables the reader functions to use optimistic spinning in future patches. There is no code change. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 172 ++++++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fbe96341beee..0d518b52ade4 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -224,92 +224,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); } -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count.
- */ - if (atomic_long_read(&sem->count) >= 0) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - adjustment += RWSEM_WAITING_BIAS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - } - - __set_current_state(TASK_RUNNING); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the @@ -504,6 +418,92 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) } #endif +/* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_ACTIVE_READ_BIAS has already + * been set in the count. + */ + if (atomic_long_read(&sem->count) >= 0) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + adjustment += RWSEM_WAITING_BIAS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! 
+ */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + } + + __set_current_state(TASK_RUNNING); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * Wait until we successfully acquire the write lock */ -- cgit v1.2.3 From c7580c1e84435c9ccc6c612d9fee8e71811f7be6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:11 -0400 Subject: locking/rwsem: Move owner setting code from rwsem.c to rwsem.h Move all the owner setting code closer to the rwsem-xadd fast paths, directly within the rwsem.h file, as well as in the slowpaths where owner setting is done after acquiring the lock. This will enable us to add a DEBUG_RWSEMS check in a later patch to make sure that the read lock is really acquired when rwsem_down_read_failed() returns, for instance. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-3-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 6 +++--- kernel/locking/rwsem.c | 19 ++----------------- kernel/locking/rwsem.h | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0d518b52ade4..c213869e1aa7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -176,9 +176,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, goto try_reader_grant; } /* - * It is not really necessary to set it to reader-owned here, - * but it gives the spinners an early indication that the - * readers now have the lock. + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock.
*/ __rwsem_set_reader_owned(sem, waiter->task); } @@ -441,6 +440,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) */ if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); return sem; } adjustment += RWSEM_WAITING_BIAS; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..59e584895532 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_reader_owned(sem); return 0; } @@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_reader_owned(sem); - } return ret; } @@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_owner(sem); return 0; } @@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_owner(sem); - } return ret; } @@ -119,7 +111,6 @@ void up_read(struct rw_semaphore *sem) rwsem_release(&sem->dep_map, 1, _RET_IP_); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); - rwsem_clear_reader_owned(sem); __up_read(sem); } @@ -133,7 +124,6 @@ void up_write(struct rw_semaphore *sem) rwsem_release(&sem->dep_map, 1, _RET_IP_); DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_clear_owner(sem); __up_write(sem); } @@ -147,7 +137,6 @@ void downgrade_write(struct rw_semaphore *sem) lock_downgrade(&sem->dep_map, _RET_IP_); DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -161,7 +150,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); @@ -172,7 +160,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -193,7 +180,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); @@ -208,7 +194,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) return -EINTR; } - rwsem_set_owner(sem); return 0; } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 1f5775aa6a1d..ee24c4f257a5 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -160,6 +160,8 @@ static inline void __down_read(struct rw_semaphore *sem) { if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) rwsem_down_read_failed(sem); + 
else + rwsem_set_reader_owned(sem); } static inline int __down_read_killable(struct rw_semaphore *sem) @@ -167,8 +169,9 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; + } else { + rwsem_set_reader_owned(sem); } - return 0; } @@ -182,6 +185,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { + rwsem_set_reader_owned(sem); return 1; } } while (tmp >= 0); @@ -199,6 +203,7 @@ static inline void __down_write(struct rw_semaphore *sem) &sem->count); if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) rwsem_down_write_failed(sem); + rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) @@ -210,6 +215,7 @@ static inline int __down_write_killable(struct rw_semaphore *sem) if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; + rwsem_set_owner(sem); return 0; } @@ -219,7 +225,11 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; + if (tmp == RWSEM_UNLOCKED_VALUE) { + rwsem_set_owner(sem); + return true; + } + return false; } /* @@ -229,6 +239,7 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; + rwsem_clear_reader_owned(sem); tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) rwsem_wake(sem); @@ -239,6 +250,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + rwsem_clear_owner(sem); if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, &sem->count) < 0)) rwsem_wake(sem); @@ -259,6 +271,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * write side. As such, rely on RELEASE semantics. */ tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + rwsem_set_reader_owned(sem); if (tmp < 0) rwsem_downgrade_wake(sem); } -- cgit v1.2.3 From 12a30a7fc142a123c61da9623bd824d95d36c12e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:12 -0400 Subject: locking/rwsem: Move rwsem internal function declarations to rwsem-xadd.h We don't need to expose rwsem internal functions which are not supposed to be called directly from other kernel code. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-4-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index ee24c4f257a5..19997c82270b 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -153,6 +153,13 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + /* * lock for reading */ -- cgit v1.2.3 From a338ecb07a338c9a8b0ca0010e862ebe598b1551 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:13 -0400 Subject: locking/rwsem: Micro-optimize rwsem_try_write_lock_unqueued() The atomic_long_cmpxchg_acquire() in rwsem_try_write_lock_unqueued() is replaced by atomic_long_try_cmpxchg_acquire() to simplify the code and generate slightly better assembly code. There is no functional change. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-5-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index c213869e1aa7..f6198e1a58f6 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -259,21 +259,16 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = atomic_long_read(&sem->count); + long count = atomic_long_read(&sem->count); - while (true) { - if (!(count == 0 || count == RWSEM_WAITING_BIAS)) - return false; - - old = atomic_long_cmpxchg_acquire(&sem->count, count, - count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) { + while (!count || count == RWSEM_WAITING_BIAS) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count + RWSEM_ACTIVE_WRITE_BIAS)) { rwsem_set_owner(sem); return true; } - - count = old; } + return false; } static inline bool owner_on_cpu(struct task_struct *owner) -- cgit v1.2.3 From a68e2c4c637918da47b3aa270051545cff7d8245 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:14 -0400 Subject: locking/rwsem: Add debug check for __down_read*() When the rwsem_down_read_failed*() functions return, the read lock has already been acquired indirectly on the caller's behalf. So debug checks are added in __down_read() and __down_read_killable() to make sure the rwsem is really reader-owned. The other debug check calls in kernel/locking/rwsem.c except the one in up_read_non_owner() are also moved over to rwsem.h.
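The condition being asserted, as a stand-alone sketch (assuming the owner-field encoding this series relies on, where a low bit of sem->owner marks reader ownership):

    /* Sketch of the reader-ownership test the new debug checks perform
     * after a successful slowpath acquisition. */
    static inline bool rwsem_is_reader_owned_sketch(struct rw_semaphore *sem)
    {
            return (unsigned long)READ_ONCE(sem->owner) & RWSEM_READER_OWNED;
    }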
Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-6-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 3 --- kernel/locking/rwsem.h | 12 ++++++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 59e584895532..90de5f1780ba 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -109,7 +109,6 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); __up_read(sem); } @@ -122,7 +121,6 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); __up_write(sem); } @@ -135,7 +133,6 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); __downgrade_write(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 19997c82270b..1d8f722a6761 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -165,10 +165,13 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); */ static inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { rwsem_down_read_failed(sem); - else + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED)); + } else { rwsem_set_reader_owned(sem); + } } static inline int __down_read_killable(struct rw_semaphore *sem) @@ -176,6 +179,8 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED)); } else { rwsem_set_reader_owned(sem); } @@ -246,6 +251,7 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); rwsem_clear_reader_owned(sem); tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) @@ -257,6 +263,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + DEBUG_RWSEMS_WARN_ON(sem->owner != current); rwsem_clear_owner(sem); if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, &sem->count) < 0)) @@ -277,6 +284,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. 
*/ + DEBUG_RWSEMS_WARN_ON(sem->owner != current); tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); rwsem_set_reader_owned(sem); if (tmp < 0) -- cgit v1.2.3 From 3b4ba6643d26a95e08067fca9a5da1828f9afabf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:15 -0400 Subject: locking/rwsem: Enhance DEBUG_RWSEMS_WARN_ON() macro Currently, the DEBUG_RWSEMS_WARN_ON() macro just dumps a stack trace when the rwsem isn't in the right state. It does not show the actual states of the rwsem. This may not be that helpful in the debugging process. Enhance the DEBUG_RWSEMS_WARN_ON() macro to also show the current content of the rwsem count and owner fields to give more information about what is wrong with the rwsem. The debug_locks_off() function is called as is done inside DEBUG_LOCKS_WARN_ON(). Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-7-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 3 ++- kernel/locking/rwsem.h | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90de5f1780ba..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -198,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 1d8f722a6761..3059a2dc39f8 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -27,9 +27,15 @@ #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) #ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (long)((sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? 
"" : "not ")) \ + debug_locks_off(); \ + } while (0) #else -# define DEBUG_RWSEMS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem) #endif /* @@ -168,7 +174,7 @@ static inline void __down_read(struct rw_semaphore *sem) if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { rwsem_down_read_failed(sem); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED)); + RWSEM_READER_OWNED), sem); } else { rwsem_set_reader_owned(sem); } @@ -180,7 +186,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED)); + RWSEM_READER_OWNED), sem); } else { rwsem_set_reader_owned(sem); } @@ -251,7 +257,8 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_dec_return_release(&sem->count); if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) @@ -263,7 +270,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(sem->owner != current); + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); rwsem_clear_owner(sem); if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, &sem->count) < 0)) @@ -284,7 +291,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current); + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); rwsem_set_reader_owned(sem); if (tmp < 0) -- cgit v1.2.3 From ad53fa10fa9e816067bbae7109845940f5e6df50 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:16 -0400 Subject: locking/qspinlock_stat: Introduce generic lockevent_*() counting APIs The percpu event counts used by qspinlock code can be useful for other locking code as well. So a new set of lockevent_* counting APIs is introduced with the lock event names extracted out into the new lock_events_list.h header file for easier addition in the future. The existing qstat_inc() calls are replaced by either lockevent_inc() or lockevent_cond_inc() calls. The qstat_hop() call is renamed to lockevent_pv_hop(). The "reset_counters" debugfs file is also renamed to ".reset_counts". Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-8-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.h | 55 ++++++++++++ kernel/locking/lock_events_list.h | 50 +++++++++++ kernel/locking/qspinlock.c | 8 +- kernel/locking/qspinlock_paravirt.h | 19 +++-- kernel/locking/qspinlock_stat.h | 163 ++++++++++++++---------------------- 5 files changed, 181 insertions(+), 114 deletions(-) create mode 100644 kernel/locking/lock_events.h create mode 100644 kernel/locking/lock_events_list.h (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..4009e07b474a --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +enum lock_events { + +#include "lock_events_list.h" + + lockevent_num, /* Total number of lock event counts */ + LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ + if (cond) + __this_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ + __this_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_QUEUED_LOCK_STAT */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..8b4d2e180475 --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name) LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. 
+ */ +LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ +LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ +LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ +LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ +LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ +LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ +LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ +LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ +LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. + */ +LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ +LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ +LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5e9247dc2515..e14b32c69639 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); + lockevent_inc(lock_pending); return; /* @@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * queuing. */ queue: - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); pv_queue: node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; @@ -419,7 +419,7 @@ pv_queue: * simple enough. 
*/ if (unlikely(idx >= MAX_NODES)) { - qstat_inc(qstat_lock_no_node, true); + lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); goto release; @@ -430,7 +430,7 @@ pv_queue: /* * Keep counts of non-zero index values: */ - qstat_inc(qstat_lock_use_node2 + idx - 1, idx); + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) if (!(val & _Q_LOCKED_PENDING_MASK) && (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); + lockevent_inc(pv_lock_stealing); return true; } if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); - qstat_hop(hopcnt); + lockevent_pv_hop(hopcnt); return &he->lock; } } @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) smp_store_mb(pn->state, vcpu_halted); if (!READ_ONCE(node->locked)) { - qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_early, wait_early); + lockevent_inc(pv_wait_node); + lockevent_cond_inc(pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ - qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); + lockevent_cond_inc(pv_spurious_wakeup, + !READ_ONCE(node->locked)); } /* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); for (;; waitcnt++) { /* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) } } WRITE_ONCE(pn->state, vcpu_hashed); - qstat_inc(qstat_pv_wait_head, true); - qstat_inc(qstat_pv_wait_again, waitcnt); + lockevent_inc(pv_wait_head); + lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); /* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ - qstat_inc(qstat_pv_kick_unlock, true); + lockevent_inc(pv_kick_unlock); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d73f85388d5c..1db5b375fcf4 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -38,8 +38,8 @@ * Subtracting lock_use_node[234] from lock_slowpath will give you * lock_use_node1. * - * Writing to the "reset_counters" file will reset all the above counter - * values. + * Writing to the special ".reset_counts" file will reset all the above + * counter values. * * These statistical counters are implemented as per-cpu variables which are * summed and computed whenever the corresponding debugfs files are read. This @@ -48,27 +48,7 @@ * * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
*/ -enum qlock_stats { - qstat_pv_hash_hops, - qstat_pv_kick_unlock, - qstat_pv_kick_wake, - qstat_pv_latency_kick, - qstat_pv_latency_wake, - qstat_pv_lock_stealing, - qstat_pv_spurious_wakeup, - qstat_pv_wait_again, - qstat_pv_wait_early, - qstat_pv_wait_head, - qstat_pv_wait_node, - qstat_lock_pending, - qstat_lock_slowpath, - qstat_lock_use_node2, - qstat_lock_use_node3, - qstat_lock_use_node4, - qstat_lock_no_node, - qstat_num, /* Total number of statistical counters */ - qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" #ifdef CONFIG_QUEUED_LOCK_STAT /* @@ -79,99 +59,91 @@ enum qlock_stats { #include #include -static const char * const qstat_names[qstat_num + 1] = { - [qstat_pv_hash_hops] = "pv_hash_hops", - [qstat_pv_kick_unlock] = "pv_kick_unlock", - [qstat_pv_kick_wake] = "pv_kick_wake", - [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", - [qstat_pv_latency_kick] = "pv_latency_kick", - [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_stealing] = "pv_lock_stealing", - [qstat_pv_wait_again] = "pv_wait_again", - [qstat_pv_wait_early] = "pv_wait_early", - [qstat_pv_wait_head] = "pv_wait_head", - [qstat_pv_wait_node] = "pv_wait_node", - [qstat_lock_pending] = "lock_pending", - [qstat_lock_slowpath] = "lock_slowpath", - [qstat_lock_use_node2] = "lock_use_node2", - [qstat_lock_use_node3] = "lock_use_node3", - [qstat_lock_use_node4] = "lock_use_node4", - [qstat_lock_no_node] = "lock_no_node", - [qstat_reset_cnts] = "reset_counters", +#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", }; /* * Per-cpu counters */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* * Function to read and return the qlock statistical counter values * * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. pv_latency_kick * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake * Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. 
pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; - int cpu, counter, len; - u64 stat = 0, kicks = 0; + int cpu, id, len; + u64 sum = 0, kicks = 0; /* * Get the counter ID stored in file->f_inode->i_private */ - counter = (long)file_inode(file)->i_private; + id = (long)file_inode(file)->i_private; - if (counter >= qstat_num) + if (id >= lockevent_num) return -EBADF; for_each_possible_cpu(cpu) { - stat += per_cpu(qstats[counter], cpu); + sum += per_cpu(lockevents[id], cpu); /* - * Need to sum additional counter for some of them + * Need to sum additional counters for some of them */ - switch (counter) { + switch (id) { - case qstat_pv_latency_kick: - case qstat_pv_hash_hops: - kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + case LOCKEVENT_pv_latency_kick: + case LOCKEVENT_pv_hash_hops: + kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); break; - case qstat_pv_latency_wake: - kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + case LOCKEVENT_pv_latency_wake: + kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); break; } } - if (counter == qstat_pv_hash_hops) { + if (id == LOCKEVENT_pv_hash_hops) { u64 frac = 0; if (kicks) { - frac = 100ULL * do_div(stat, kicks); + frac = 100ULL * do_div(sum, kicks); frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); } /* * Return a X.XX decimal number */ - len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", + sum, frac); } else { /* * Round to the nearest ns */ - if ((counter == qstat_pv_latency_kick) || - (counter == qstat_pv_latency_wake)) { + if ((id == LOCKEVENT_pv_latency_kick) || + (id == LOCKEVENT_pv_latency_wake)) { if (kicks) - stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); } - len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); } return simple_read_from_buffer(user_buf, count, ppos, buf, len); @@ -180,11 +152,9 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Function to handle write request * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. + * When id = .reset_cnts, reset all the counter values. 
*/ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { int cpu; @@ -192,14 +162,14 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if ((long)file_inode(file)->i_private != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) return count; for_each_possible_cpu(cpu) { int i; - unsigned long *ptr = per_cpu_ptr(qstats, cpu); + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); - for (i = 0 ; i < qstat_num; i++) + for (i = 0 ; i < lockevent_num; i++) WRITE_ONCE(ptr[i], 0); } return count; @@ -208,9 +178,9 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Debugfs data structures */ -static const struct file_operations fops_qstat = { - .read = qstat_read, - .write = qstat_write, +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, .llseek = default_llseek, }; @@ -219,10 +189,10 @@ static const struct file_operations fops_qstat = { */ static int __init init_qspinlock_stat(void) { - struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + struct dentry *d_counts = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) + if (!d_counts) goto out; /* @@ -232,39 +202,31 @@ static int __init init_qspinlock_stat(void) * root is allowed to do the read/write to limit impact to system * performance. */ - for (i = 0; i < qstat_num; i++) - if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat)) + for (i = 0; i < lockevent_num; i++) + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) goto fail_undo; - if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat)) + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) goto fail_undo; return 0; fail_undo: - debugfs_remove_recursive(d_qstat); + debugfs_remove_recursive(d_counts); out: pr_warn("Could not create 'qlockstat' debugfs entries\n"); return -ENOMEM; } fs_initcall(init_qspinlock_stat); -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ - if (cond) - this_cpu_inc(qstats[stat]); -} - /* * PV hash hop count */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt) { - this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); + this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); } /* @@ -276,7 +238,7 @@ static inline void __pv_kick(int cpu) per_cpu(pv_kick_time, cpu) = start; pv_kick(cpu); - this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); + this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); } /* @@ -289,9 +251,9 @@ static inline void __pv_wait(u8 *ptr, u8 val) *pkick_time = 0; pv_wait(ptr, val); if (*pkick_time) { - this_cpu_add(qstats[qstat_pv_latency_wake], + this_cpu_add(EVENT_COUNT(pv_latency_wake), sched_clock() - *pkick_time); - qstat_inc(qstat_pv_kick_wake, true); + lockevent_inc(pv_kick_wake); } } @@ -300,7 +262,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #else /* CONFIG_QUEUED_LOCK_STAT */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -static inline void qstat_hop(int hopcnt) { } +static inline void 
lockevent_pv_hop(int hopcnt) { } #endif /* CONFIG_QUEUED_LOCK_STAT */ -- cgit v1.2.3 From fb346fd9fc081c3d978c3f3d26d39334527a2662 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:17 -0400 Subject: locking/lock_events: Make lock_events available for all archs & other locks The QUEUED_LOCK_STAT option to report queued spinlocks event counts was previously allowed only on x86 architecture. To make the locking event counting code more useful, it is now renamed to a more generic LOCK_EVENT_COUNTS config option. This new option will be available to all the architectures that use qspinlock at the moment. Other locking code can now start to use the generic locking event counting code by including lock_events.h and put the new locking event names into the lock_events_list.h header file. My experience with lock event counting is that it gives valuable insight on how the locking code works and what can be done to make it better. I would like to extend this benefit to other locking code like mutex and rwsem in the near future. The PV qspinlock specific code will stay in qspinlock_stat.h. The locking event counters will now reside in the /lock_event_counts directory. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-9-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 1 + kernel/locking/lock_events.c | 153 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/lock_events.h | 10 ++- kernel/locking/qspinlock_stat.h | 141 ++++-------------------------------- 4 files changed, 173 insertions(+), 132 deletions(-) create mode 100644 kernel/locking/lock_events.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 1af83e9ce57d..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..71c36d1fb834 --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +/* + * Collect locking event counts + */ +#include +#include +#include +#include + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the /lock_event_counts/ + * directory. 
See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (!d_counts) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. 
+ */ + for (i = 0; i < lockevent_num; i++) + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) + goto fail_undo; + + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 4009e07b474a..feb1acc54611 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -13,6 +13,9 @@ * Authors: Waiman Long */ +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + enum lock_events { #include "lock_events_list.h" @@ -21,7 +24,7 @@ enum lock_events { LOCKEVENT_reset_cnts = lockevent_num, }; -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS /* * Per-cpu counters */ @@ -46,10 +49,11 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#else /* CONFIG_LOCK_EVENT_COUNTS */ #define lockevent_inc(ev) #define lockevent_add(ev, c) #define lockevent_cond_inc(ev, c) -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 1db5b375fcf4..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -9,76 +9,29 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Authors: Waiman Long + * Authors: Waiman Long */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * /qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * lock_pending - # of locking operations via pending code - * lock_slowpath - # of locking operations via MCS lock queue - * lock_use_node2 - # of locking operations that use 2nd per-CPU node - * lock_use_node3 - # of locking operations that use 3rd per-CPU node - * lock_use_node4 - # of locking operations that use 4th per-CPU node - * lock_no_node - # of locking operations without using per-CPU node - * - * Subtracting lock_use_node[234] from lock_slowpath will give you - * lock_use_node1. - * - * Writing to the special ".reset_counts" file will reset all the above - * counter values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. 
- * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. - */ #include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include #include #include #include #define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] -#undef LOCK_EVENT -#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, - -static const char * const lockevent_names[lockevent_num + 1] = { - -#include "lock_events_list.h" - - [LOCKEVENT_reset_cnts] = ".reset_counts", -}; - /* - * Per-cpu counters + * PV specific per-cpu counter */ -DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: * 1. pv_latency_kick @@ -88,8 +41,8 @@ static DEFINE_PER_CPU(u64, pv_kick_time); * 3. pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t lockevent_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; int cpu, id, len; @@ -149,78 +102,6 @@ static ssize_t lockevent_read(struct file *file, char __user *user_buf, return simple_read_from_buffer(user_buf, count, ppos, buf, len); } -/* - * Function to handle write request - * - * When id = .reset_cnts, reset all the counter values. - */ -static ssize_t lockevent_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(lockevents, cpu); - - for (i = 0 ; i < lockevent_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_lockevent = { - .read = lockevent_read, - .write = lockevent_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_counts = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_counts) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. 
- */ - for (i = 0; i < lockevent_num; i++) - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) - goto fail_undo; - - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, - d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_counts); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - /* * PV hash hop count */ @@ -260,8 +141,10 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ -- cgit v1.2.3 From bf20616f46e536fe8affed6f138db4b3040b55a6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:18 -0400 Subject: locking/lock_events: Don't show pvqspinlock events on bare metal On bare metal, the pvqspinlock event counts will always be 0. So there is no point in showing their corresponding debugfs files. So they are skipped in this case. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-10-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c index 71c36d1fb834..fa2c2f951c6b 100644 --- a/kernel/locking/lock_events.c +++ b/kernel/locking/lock_events.c @@ -115,6 +115,29 @@ static const struct file_operations fops_lockevent = { .llseek = default_llseek, }; +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include + +static bool __init skip_lockevent(const char *name) +{ + static int pv_on __initdata = -1; + + if (pv_on < 0) + pv_on = !pv_is_native_spin_unlock(); + /* + * Skip PV qspinlock events on bare metal. + */ + if (!pv_on && !memcmp(name, "pv_", 3)) + return true; + return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ + return false; +} +#endif + /* * Initialize debugfs for the locking event counts. */ @@ -133,10 +156,13 @@ static int __init init_lockevent_counts(void) * root is allowed to do the read/write to limit impact to system * performance. */ - for (i = 0; i < lockevent_num; i++) + for (i = 0; i < lockevent_num; i++) { + if (skip_lockevent(lockevent_names[i])) + continue; if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, (void *)(long)i, &fops_lockevent)) goto fail_undo; + } if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, d_counts, (void *)(long)LOCKEVENT_reset_cnts, -- cgit v1.2.3 From a8654596f0371c2604c4d475422c48f4fc6a56c9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:19 -0400 Subject: locking/rwsem: Enable lock event counting Add lock event counting calls so that we can track the number of lock events happening in the rwsem code. 
With CONFIG_LOCK_EVENT_COUNTS on and booting a 4-socket 112-thread x86-64 system, the rwsem counts after system bootup were as follows: rwsem_opt_fail=261 rwsem_opt_wlock=50636 rwsem_rlock=445 rwsem_rlock_fail=0 rwsem_rlock_fast=22 rwsem_rtrylock=810144 rwsem_sleep_reader=441 rwsem_sleep_writer=310 rwsem_wake_reader=355 rwsem_wake_writer=2335 rwsem_wlock=261 rwsem_wlock_fail=0 rwsem_wtrylock=20583 It can be seen that most of the lock acquisitions in the slowpath were write-locks in the optimistic spinning code path with no sleeping at all. For this system, over 97% of the locks are acquired via optimistic spinning. It illustrates the importance of optimistic spinning in improving the performance of rwsem. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-11-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 17 +++++++++++++++++ kernel/locking/rwsem-xadd.c | 11 +++++++++++ kernel/locking/rwsem.h | 4 ++++ 3 files changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 8b4d2e180475..ad7668cfc9da 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -48,3 +48,20 @@ LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ +LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ +LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ +LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ +LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ +LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ +LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index f6198e1a58f6..6b3ee9948bf1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * will notice the queued writer. 
*/ wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); } return; @@ -214,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); if (list_empty(&sem->wait_list)) { /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -265,6 +267,7 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, count + RWSEM_ACTIVE_WRITE_BIAS)) { rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); return true; } } @@ -389,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) osq_unlock(&sem->osq); done: preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } @@ -436,6 +440,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); return sem; } adjustment += RWSEM_WAITING_BIAS; @@ -472,9 +477,11 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) break; } schedule(); + lockevent_inc(rwsem_sleep_reader); } __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); return sem; out_nolock: list_del(&waiter.list); @@ -482,6 +489,7 @@ out_nolock: atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); return ERR_PTR(-EINTR); } @@ -575,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) goto out_nolock; schedule(); + lockevent_inc(rwsem_sleep_writer); set_current_state(state); } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); @@ -583,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); return ret; @@ -596,6 +606,7 @@ out_nolock: __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 3059a2dc39f8..37db17890e36 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -23,6 +23,8 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. */ +#include "lock_events.h" + #define RWSEM_READER_OWNED (1UL << 0) #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) @@ -200,6 +202,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ long tmp = RWSEM_UNLOCKED_VALUE; + lockevent_inc(rwsem_rtrylock); do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_ACTIVE_READ_BIAS)) { @@ -241,6 +244,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) { long tmp; + lockevent_inc(rwsem_wtrylock); tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); if (tmp == RWSEM_UNLOCKED_VALUE) { -- cgit v1.2.3 From ee6a6500fe1f5c5a3f18de33fe0178a3c627f6d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 10 Apr 2019 10:45:38 -0400 Subject: ftrace: Remove ASSIGN_OPS_HASH() macro from ftrace.c The ASSIGN_OPS_HASH() macro was moved to fgraph.c where it was used, but for some reason it wasn't removed from ftrace.c, as it is no longer referenced there. 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26c8ca9bd06b..bf11e0553450 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,12 +69,8 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#define ASSIGN_OPS_HASH(opsname, val) \ - .func_hash = val, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) -#define ASSIGN_OPS_HASH(opsname, val) #endif enum { -- cgit v1.2.3 From 83ca259489409a1fe8a83dad83a82f32174d4f31 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Apr 2019 09:15:25 +0800 Subject: swiotlb: dump used and total slots when swiotlb buffer is full So far the kernel only prints the requested size if swiotlb buffer if full. It is not possible to know whether it is simply an out of buffer, or it is because swiotlb cannot allocate buffer with the requested size due to fragmentation. As 'io_tlb_used' is available since commit 71602fe6d4e9 ("swiotlb: add debugfs to track swiotlb buffer usage"), both 'io_tlb_used' and 'io_tlb_nslabs' are printed when swiotlb buffer is full. Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2b0c8fd9658e..82c767374c70 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -533,7 +533,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu, used %lu\n", + size, io_tlb_nslabs, io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; -- cgit v1.2.3 From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). 
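As a rough user-space sketch of the intended usage (prog_fd, pkt and pkt_len are placeholders; the test.* field names follow the bpf_attr additions in this patch):

	struct __sk_buff ctx_in = {}, ctx_out = {};
	union bpf_attr attr = {};

	ctx_in.cb[0] = 1;		/* meta data the program can read */
	ctx_in.priority = 2;		/* only cb[] and priority may be set */

	attr.test.prog_fd = prog_fd;
	attr.test.data_in = (__u64)(unsigned long)pkt;
	attr.test.data_size_in = pkt_len;
	attr.test.ctx_in = (__u64)(unsigned long)&ctx_in;
	attr.test.ctx_size_in = sizeof(ctx_in);
	attr.test.ctx_out = (__u64)(unsigned long)&ctx_out;
	attr.test.ctx_size_out = sizeof(ctx_out);

	err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
	/* on success, ctx_out holds the potentially modified __sk_buff */
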
v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 438199e2eca4..d995eedfdd16 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,7 +2009,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2022,6 +2022,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From 2fa717a0337e7acafda9283c938b635191b8036b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 11 Apr 2019 11:46:13 -0400 Subject: ftrace: Do not process STUB functions in ftrace_ops_list_func() The function_graph tracer has a stub function and its ops flag has the FTRACE_OPS_FL_STUB set. As the function graph does not use the ftrace_ops->func pointer but instead is called by a separate part of the ftrace trampoline. The function_graph tracer still requires to pass in a ftrace_ops that may also hold the hash of the functions to call. But there's no reason to test that hash in the function tracing portion. Instead of testing to see if we should call the stub function, just test if the ops has FTRACE_OPS_FL_STUB set, and just skip it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf11e0553450..433a64f49532 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6260,6 +6260,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Stub functions don't need to be called nor tested */ + if (op->flags & FTRACE_OPS_FL_STUB) + continue; /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true -- cgit v1.2.3 From b1cd609d9b517f01867c211bd520cc805db3068a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Tue, 12 Mar 2019 09:27:09 -0700 Subject: bpf: Add base proto function for cgroup-bpf programs Currently kernel/bpf/cgroup.c contains only one program type and one proto function cgroup_dev_func_proto(). It'd be useful to have base proto function that can be reused for new cgroup-bpf program types coming soon. Introduce cgroup_base_func_proto(). 
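The intended reuse pattern, sketched with a hypothetical program type (foo_func_proto and its helper are illustrative only, not part of this patch):

	static const struct bpf_func_proto *
	foo_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
	{
		switch (func_id) {
		case BPF_FUNC_foo_helper:	/* hypothetical type-specific helper */
			return &bpf_foo_helper_proto;
		default:
			/* fall back to the helpers common to all cgroup-bpf types */
			return cgroup_base_func_proto(func_id, prog);
		}
	}
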
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4e807973aa80..f6cd38746df2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -701,7 +701,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -725,6 +725,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit v1.2.3 From 7b146cebe30cb481b0f70d85779da938da818637 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 12:59:24 -0800 Subject: bpf: Sysctl hook Containerized applications may run as root and it may create problems for whole host. Specifically such applications may change a sysctl and affect applications in other containers. Furthermore in existing infrastructure it may not be possible to just completely disable writing to sysctl, instead such a process should be gradual with ability to log what sysctl are being changed by a container, investigate, limit the set of writable sysctl to currently used ones (so that new ones can not be changed) and eventually reduce this set to zero. The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis. New program type has access to following minimal context: struct bpf_sysctl { __u32 write; }; Where @write indicates whether sysctl is being read (= 0) or written (= 1). Helpers to access sysctl name and value will be introduced separately. BPF_CGROUP_SYSCTL attach point is added to sysctl code right before passing control to ctl_table->proc_handler so that BPF program can either allow or deny access to sysctl. Suggested-by: Roman Gushchin Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++ kernel/bpf/verifier.c | 1 + 3 files changed, 100 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f6cd38746df2..610491b5f0aa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -768,3 +770,93 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. 
In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + }; + struct cgroup *cgrp; + int ret; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d995eedfdd16..92c9b8a32b50 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1888,6 +1888,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1966,6 +1969,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1999,6 +2005,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f25b7c9c20ba..20808e3c95a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5267,6 +5267,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: break; default: return 0; -- cgit v1.2.3 From 808649fb787d918a48a360a668ee4ee9023f0c11 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 13:28:48 -0800 Subject: bpf: Introduce bpf_sysctl_get_name helper Add bpf_sysctl_get_name() helper to copy sysctl name 
(/proc/sys/ entry) into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem". If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied, e.g. "tcp_mem". Documentation for the new helper is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 610491b5f0aa..a68387043244 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -806,10 +807,77 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return cgroup_base_func_proto(func_id, prog); + switch (func_id) { + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } } static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, -- cgit v1.2.3 From 1d11b3016cec4ed9770b98e82a61708c8f4926e7 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Feb 2019 19:22:15 -0800 Subject: bpf: Introduce bpf_sysctl_get_current_value helper Add bpf_sysctl_get_current_value() helper to copy current sysctl value into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. It provides same string as user space can see by reading corresponding file in /proc/sys/, including new line, etc. Documentation for the new helper is provided in bpf.h UAPI. Since current value is kept in ctl_table->data in a parsed form, ctl_table->proc_handler() with write=0 is called to read that data and convert it to a string. Such a string can later be parsed by a program using helpers that will be introduced separately. 
Unfortunately it's not trivial to provide API to access parsed data due to variety of data representations (string, intvec, uintvec, ulongvec, custom structures, even NULL, etc). Instead it's assumed that user know how to handle specific sysctl they're interested in and appropriate helpers can be used. Since ctl_table->proc_handler() expects __user buffer, conversion to __user happens for kernel allocated one where the value is stored. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a68387043244..c6b2cf29a54b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -794,15 +794,37 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .head = head, .table = table, .write = write, + .cur_val = NULL, + .cur_len = PAGE_SIZE, }; struct cgroup *cgrp; int ret; + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); rcu_read_unlock(); + kfree(ctx.cur_val); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -869,12 +891,55 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .arg4_type = ARG_ANYTHING, }; +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From 4e63acdff864654cee0ac5aaeda3913798ee78f6 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:38:43 -0800 Subject: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers Add helpers to work with new value being written to sysctl by user space. bpf_sysctl_get_new_value() copies value being written to sysctl into provided buffer. bpf_sysctl_set_new_value() overrides new value being written by user space with a one from provided buffer. 
Buffer should contain string representation of the value, similar to what can be seen in /proc/sys/. Both helpers can be used only on sysctl write. File position matters and can be managed by an interface that will be introduced separately. E.g. if user space calls sys_write to a file in /proc/sys/ at file position = X, where X > 0, then the value set by bpf_sysctl_set_new_value() will be written starting from X. If program wants to override whole value with specified buffer, file position has to be set to zero. Documentation for the new helpers is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c6b2cf29a54b..ba4e21986760 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -778,6 +778,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -788,7 +795,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type) + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, @@ -796,6 +804,9 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .write = write, .cur_val = NULL, .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, }; struct cgroup *cgrp; int ret; @@ -818,6 +829,18 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, ctx.cur_len = 0; } + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min(PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. */ + ctx.new_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); @@ -825,6 +848,13 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -932,6 +962,51 @@ static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -940,6 +1015,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From e1550bfe0de47e30484ba91de1e50a91ec1c31f5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:50:52 -0800 Subject: bpf: Add file_pos field to bpf_sysctl ctx Add file_pos field to bpf_sysctl context to read and write sysctl file position at which sysctl is being accessed (read or written). The field can be used to e.g. override whole sysctl value on write to sysctl even when sys_write is called by user space with file_pos > 0. Or BPF program may reject such accesses. 
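For example, a program can insist that the whole value is replaced regardless of the position user space wrote at (a rough sketch using the libbpf-style SEC() annotation; not part of this patch):

	SEC("cgroup/sysctl")
	int sysctl_file_pos(struct bpf_sysctl *ctx)
	{
		/* deny partial writes at a non-zero offset ... */
		if (ctx->write && ctx->file_pos != 0)
			return 0;
		/* ... or, alternatively, rewind so the new value replaces
		 * the old one from the beginning:
		 *	ctx->file_pos = 0;
		 */
		return 1;	/* allow the access */
	}
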
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba4e21986760..b2adf22139b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -782,6 +782,9 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise * @new_buf: pointer to pointer to new buffer that will be allocated if program * overrides new value provided by user space on sysctl write * NOTE: it's caller responsibility to free *new_buf if it was set @@ -796,12 +799,14 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type) + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, .table = table, .write = write, + .ppos = ppos, .cur_val = NULL, .cur_len = PAGE_SIZE, .new_val = NULL, @@ -1030,14 +1035,22 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, { const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sysctl) || - off % size || type != BPF_READ) + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) return false; switch (off) { case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } default: return false; } @@ -1059,6 +1072,41 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, write), target_size)); break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. 
+ */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; } return insn - insn_buf; -- cgit v1.2.3 From 57c3bb725a3dd97d960d7e1cd0845d88de53217f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 16:57:10 -0700 Subject: bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types Currently the way to pass result from BPF helper to BPF program is to provide memory area defined by pointer and size: func(void *, size_t). It works great for generic use-case, but for simple types, such as int, it's overkill and consumes two arguments when it could use just one. Introduce new argument types ARG_PTR_TO_INT and ARG_PTR_TO_LONG to be able to pass result from helper to program via pointer to int and long correspondingly: func(int *) or func(long *). New argument types are similar to ARG_PTR_TO_MEM with the following differences: * they don't require corresponding ARG_CONST_SIZE argument, predefined access sizes are used instead (32bit for int, 64bit for long); * it's possible to use more than one such an argument in a helper; * provided pointers have to be aligned. It's easy to introduce similar ARG_PTR_TO_CHAR and ARG_PTR_TO_SHORT argument types. It's not done due to lack of use-case though. 
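A sketch of what a helper proto using the new types looks like (bpf_foo and its proto are hypothetical):

	static const struct bpf_func_proto bpf_foo_proto = {
		.func		= bpf_foo,	/* e.g. bpf_foo(const char *, size_t, long *) */
		.ret_type	= RET_INTEGER,
		.arg1_type	= ARG_PTR_TO_MEM,
		.arg2_type	= ARG_CONST_SIZE,
		/* one argument slot for the result; the verifier checks that
		 * the pointer is aligned and that 8 bytes are accessible
		 */
		.arg3_type	= ARG_PTR_TO_LONG,
	};
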
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20808e3c95a8..15ab6fa817ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2462,6 +2462,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_INT || + type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ + if (type == ARG_PTR_TO_INT) + return sizeof(u32); + else if (type == ARG_PTR_TO_LONG) + return sizeof(u64); + + return -EINVAL; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2554,6 +2570,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_int_ptr(arg_type)) { + expected_type = PTR_TO_STACK; + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) + goto err_type; } else { verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -2635,6 +2657,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + } else if (arg_type_is_int_ptr(arg_type)) { + int size = int_ptr_type_to_size(arg_type); + + err = check_helper_mem_access(env, regno, size, false, meta); + if (err) + return err; + err = check_ptr_alignment(env, reg, 0, size, true); } return err; -- cgit v1.2.3 From d7a4cb9b6705a89937d12c8158a35a3145dc967a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 17:55:26 -0700 Subject: bpf: Introduce bpf_strtol and bpf_strtoul helpers Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned long, respectively. They are similar to user space strtol(3) and strtoul(3) with a few changes to the API: * instead of a NUL-terminated C string the helpers expect a buffer and a buffer length; * the resulting long or unsigned long is returned in a separate result argument; * the return value is used to indicate success or failure; on success the number of consumed bytes is returned, which can be used to identify the position to read next if the buffer is expected to contain multiple integers; * instead of a *base* argument, *flags* is used; it provides the base in the 5 LSB, and the other bits are reserved for future use; * the number of supported bases is limited. Documentation for the new helpers is provided in the bpf.h UAPI. The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs so that they can convert string input to e.g. "ulongvec" output. E.g. "net/ipv4/tcp_mem" consists of three ulong integers, which can be parsed by calling bpf_strtoul three times. Implementation notes: The implementation includes "../../lib/kstrtox.h" to reuse the integer parsing functions. It's done exactly the same way as fs/proc/base.c already does. Unfortunately the existing kstrtoX functions can't be used directly since they fail if any invalid character is present right after the integer in the string. The existing simple_strtoX functions can't be used either since they're obsolete and don't handle overflow properly.
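As a usage illustration, a BPF_PROG_TYPE_CGROUP_SYSCTL program could parse the three tcp_mem values roughly as follows. This is a minimal sketch, not code from the patch: it assumes the bpf_sysctl_get_new_value() helper introduced earlier in this series, the program and section names are illustrative, and a real program additionally has to keep the length argument within verifier-provable bounds.

	SEC("cgroup/sysctl")
	int sysctl_tcp_mem(struct bpf_sysctl *ctx)
	{
		unsigned long tcp_mem[3];	/* ARG_PTR_TO_LONG requires stack pointers */
		char value[64] = {};
		int ret, off = 0, i;

		/* copy the value being written; fails if this is not a write */
		ret = bpf_sysctl_get_new_value(ctx, value, sizeof(value));
		if (ret < 0)
			return 1;	/* reads are allowed through */

		for (i = 0; i < 3; i++) {
			/* flags == 0: base in the 5 LSB, 0 means autodetect */
			ret = bpf_strtoul(value + off, sizeof(value) - off, 0,
					  &tcp_mem[i]);
			if (ret <= 0)
				return 0;	/* parse failure, reject the write */
			off += ret;	/* ret is the number of consumed bytes */
		}
		/* policy checks on tcp_mem[0..2] would go here */
		return 1;	/* allow the write */
	}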
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 4 ++ kernel/bpf/helpers.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b2adf22139b3..789d4ab2336e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1016,6 +1016,10 @@ static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a411fc17d265..4266ffde07ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = 
__bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif -- cgit v1.2.3 From 51356ac89b5a15e5207e8740d5f4f8b71cb7332f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 12 Apr 2019 16:01:01 -0700 Subject: bpf: Fix distinct pointer types warning for ARCH=i386 Fix a new warning reported by kbuild for make ARCH=i386: In file included from kernel/bpf/cgroup.c:11:0: kernel/bpf/cgroup.c: In function '__cgroup_bpf_run_filter_sysctl': include/linux/kernel.h:827:29: warning: comparison of distinct pointer types lacks a cast (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) ^ include/linux/kernel.h:841:4: note: in expansion of macro '__typecheck' (__typecheck(x, y) && __no_side_effects(x, y)) ^~~~~~~~~~~ include/linux/kernel.h:851:24: note: in expansion of macro '__safe_cmp' __builtin_choose_expr(__safe_cmp(x, y), \ ^~~~~~~~~~ include/linux/kernel.h:860:19: note: in expansion of macro '__careful_cmp' #define min(x, y) __careful_cmp(x, y, <) ^~~~~~~~~~~~~ >> kernel/bpf/cgroup.c:837:17: note: in expansion of macro 'min' ctx.new_len = min(PAGE_SIZE, *pcount); ^~~ Fixes: 4e63acdff864 ("bpf: Introduce bpf_sysctl_{get,set}_new_value helpers") Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 789d4ab2336e..e58a6c247f56 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -839,7 +839,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); - ctx.new_len = min(PAGE_SIZE, *pcount); + ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); if (!ctx.new_val || copy_from_user(ctx.new_val, buf, ctx.new_len)) /* Let BPF program decide how to proceed. */ -- cgit v1.2.3 From 53b29c336830db48ad3dc737f88b8c065b1f0851 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 12 Apr 2019 19:38:26 +0800 Subject: swiotlb: save io_tlb_used to local variable before leaving critical section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swiotlb is full, the kernel would print io_tlb_used. However, the result might be inaccurate at that time because we have already left the critical section protected by the spinlock. Therefore, back up io_tlb_used into a local variable before leaving the critical section.
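The underlying pattern is generic: a value that is only stable while the lock is held has to be snapshotted into a local variable before the unlock if it is going to be reported afterwards. A condensed sketch of the shape of the fix, using the names from the diff below:

	spin_lock_irqsave(&io_tlb_lock, flags);
	/* ... slot search failed, fall through to not_found ... */
	tmp_io_tlb_used = io_tlb_used;	/* snapshot while still protected */
	spin_unlock_irqrestore(&io_tlb_lock, flags);

	/* io_tlb_used may change from here on; report the snapshot instead */
	dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
		 size, io_tlb_nslabs, tmp_io_tlb_used);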
Fixes: 83ca25948940 ("swiotlb: dump used and total slots when swiotlb buffer is full") Suggested-by: Håkon Bugge Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 82c767374c70..38d57218809c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,6 +445,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, unsigned long mask; unsigned long offset_slots; unsigned long max_slots; + unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -531,10 +532,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, } while (index != wrap); not_found: + tmp_io_tlb_used = io_tlb_used; + spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu, used %lu\n", - size, io_tlb_nslabs, io_tlb_used); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + size, io_tlb_nslabs, tmp_io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; -- cgit v1.2.3 From 1b04aee7e2182454a663950e68084fa5ada9625a Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:34 +0100 Subject: bpf: refactor propagate_liveness to eliminate duplicated for loop Propagation for registers and for stack slots is finished in two separate for loops, although they could perfectly well be put into a single loop. This also lets them share some common variables in later patches. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ab6fa817ce..da285df492fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6254,10 +6254,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, return err; } } - } - /* ... and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { + /* Propagate stack slots. */ state = vstate->frame[frame]; parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && -- cgit v1.2.3 From 3f8cafa4131f67d47c8de326c7dcd561cc65fb38 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:35 +0100 Subject: bpf: refactor propagate_liveness to eliminate code redundancy Accesses to the reg states were not factored out; the consequence is long dereferencing code whose indentation is hard to read. This patch factors that code out so the core logic in the loop is easier to follow.
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da285df492fd..6fd36a8ba1a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6232,8 +6232,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, struct bpf_verifier_state *vparent) { - int i, frame, err = 0; + struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; + int i, frame, err = 0; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -6243,28 +6244,33 @@ static int propagate_liveness(struct bpf_verifier_env *env, /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); for (frame = 0; frame <= vstate->curframe; frame++) { + parent = vparent->frame[frame]; + state = vstate->frame[frame]; + parent_reg = parent->regs; + state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + if (parent_reg[i].live & REG_LIVE_READ) continue; - if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[frame]->regs[i], - &vparent->frame[frame]->regs[i]); - if (err) - return err; - } + if (!(state_reg[i].live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + if (err) + return err; } /* Propagate stack slots. */ - state = vstate->frame[frame]; - parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + parent_reg = &parent->stack[i].spilled_ptr; + state_reg = &state->stack[i].spilled_ptr; + if (parent_reg->live & REG_LIVE_READ) continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_reg_read(env, &state->stack[i].spilled_ptr, - &parent->stack[i].spilled_ptr); + continue; + if (!(state_reg->live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, state_reg, parent_reg); + if (err) + return err; } } return err; -- cgit v1.2.3 From 55e7f3b5ac94a8c9f2e35961a45e9aa526a9e41d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:36 +0100 Subject: bpf: factor out reg and stack slot propagation into "propagate_liveness_reg" After the code refactoring in the previous patches, it becomes clear that the propagation logic inside the for loop in "propagate_liveness" is uniform enough to be factored out into a common function "propagate_liveness_reg".
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6fd36a8ba1a0..3fdb301c4f8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6221,6 +6221,22 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +static int propagate_liveness_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct bpf_reg_state *parent_reg) +{ + int err; + + if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + return 0; + + err = mark_reg_read(env, reg, parent_reg); + if (err) + return err; + + return 0; +} + /* A write screens off any subsequent reads; but write marks come from the * straight-line code between a state and its parent. When we arrive at an * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -6250,11 +6266,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (parent_reg[i].live & REG_LIVE_READ) - continue; - if (!(state_reg[i].live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + err = propagate_liveness_reg(env, &state_reg[i], + &parent_reg[i]); if (err) return err; } @@ -6264,11 +6277,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, i < parent->allocated_stack / BPF_REG_SIZE; i++) { parent_reg = &parent->stack[i].spilled_ptr; state_reg = &state->stack[i].spilled_ptr; - if (parent_reg->live & REG_LIVE_READ) - continue; - if (!(state_reg->live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, state_reg, parent_reg); + err = propagate_liveness_reg(env, state_reg, + parent_reg); if (err) return err; } -- cgit v1.2.3 From c342dc109aa5a4f0bb36335cb441aaafc98b98ef Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:37 +0100 Subject: bpf: refactor "check_reg_arg" to eliminate code redundancy There are a few "regs[regno]" dereferences here and there across "check_reg_arg"; this patch factors them out into a simple "reg" pointer. The intention is to simplify the code indentation and make the later patches in this set look cleaner.
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3fdb301c4f8c..c7220153c5b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1177,30 +1177,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); return -EINVAL; } + reg = ®s[regno]; if (t == SRC_OP) { /* check whether register used as source operand can be read */ - if (regs[regno].type == NOT_INIT) { + if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ - if (regno != BPF_REG_FP) - return mark_reg_read(env, ®s[regno], - regs[regno].parent); + if (regno == BPF_REG_FP) + return 0; + + return mark_reg_read(env, reg, reg->parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - regs[regno].live |= REG_LIVE_WRITTEN; + reg->live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } -- cgit v1.2.3 From 26536e7c242e2b0f73c25c46fc50d2525ebe400b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 13 Apr 2019 13:22:44 -0400 Subject: locking/rwsem: Prevent unneeded warning during locking selftest Disable the DEBUG_RWSEMS check when locking selftest is running with debug_locks_silent flag set. Signed-off-by: Waiman Long Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: http://lkml.kernel.org/r/20190413172259.2740-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 37db17890e36..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -30,7 +30,8 @@ #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ - if (WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ (long)((sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ -- cgit v1.2.3 From 2dfed4565afe263751d2451ad22336ad806c25a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:28:04 +0200 Subject: lockdep: Remove the ULONG_MAX stack trace hackery No architecture terminates the stack trace with ULONG_MAX anymore. Remove the cruft. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Will Deacon Link: https://lkml.kernel.org/r/20190410103644.485737321@linutronix.de --- kernel/locking/lockdep.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e16766ff184b..2edf9501d906 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -444,17 +444,6 @@ static int save_trace(struct stack_trace *trace) save_stack_trace(trace); - /* - * Some daft arches put -1 at the end to indicate its a full trace. - * - * this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; - trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; -- cgit v1.2.3 From accddc41b96915ab4e5d37796c6d17d70805999c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:28:08 +0200 Subject: latency_top: Remove the ULONG_MAX stack trace hackery No architecture terminates the stack trace with ULONG_MAX anymore. The consumer terminates on the first zero entry or at the number of entries, so no functional change. Remove the cruft. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Link: https://lkml.kernel.org/r/20190410103644.853527514@linutronix.de --- kernel/latencytop.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..f5a90ab3c6b9 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk, break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry marks end of backtrace: */ + if (!record) break; } if (same) { @@ -210,8 +210,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry is end of backtrace */ + if (!record) break; } if (same) { @@ -252,10 +252,10 @@ static int lstats_show(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; + seq_printf(m, " %ps", (void *)bt); } seq_puts(m, "\n"); -- cgit v1.2.3 From 4285f2fcef8001ead0f1c9315ba50302cab68cda Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Apr 2019 12:28:10 +0200 Subject: tracing: Remove the ULONG_MAX stack trace hackery No architecture terminates the stack trace with ULONG_MAX anymore. As the code checks the number of entries stored anyway there is no point in keeping all that ULONG_MAX magic around. The histogram code zeroes the storage before saving the stack, so if the trace is shorter than the maximum number of entries it can terminate the print loop if a zero entry is detected. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Link: https://lkml.kernel.org/r/20190410103645.048761764@linutronix.de --- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_stack.c | 20 +++++--------------- 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 795aa2038377..21ceae299f7e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5246,7 +5246,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned int i; for (i = 0; i < max_entries; i++) { - if (stacktrace_entries[i] == ULONG_MAX) + if (!stacktrace_entries[i]) return; seq_printf(m, "%*c", 1 + spaces, ' '); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..c6e54ff25cae 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,8 +18,7 @@ #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = - { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES + 1]; unsigned stack_trace_index[STACK_TRACE_ENTRIES]; /* @@ -52,10 +51,7 @@ void stack_trace_print(void) stack_trace_max.nr_entries); for (i = 0; i < stack_trace_max.nr_entries; i++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + if (i + 1 == stack_trace_max.nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -150,8 +146,6 @@ check_stack(unsigned long ip, unsigned long *stack) p = start; for (; p < top && i < stack_trace_max.nr_entries; p++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; /* * The READ_ONCE_NOCHECK is used to let KASAN know that * this is not a stack-out-of-bounds error. @@ -183,8 +177,6 @@ check_stack(unsigned long ip, unsigned long *stack) } stack_trace_max.nr_entries = x; - for (; x < i; x++) - stack_dump_trace[x] = ULONG_MAX; if (task_stack_end_corrupted(current)) { stack_trace_print(); @@ -286,7 +278,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n >= stack_trace_max.nr_entries) return NULL; m->private = (void *)n; @@ -360,12 +352,10 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= stack_trace_max.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (i >= stack_trace_max.nr_entries) return 0; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + if (i + 1 == stack_trace_max.nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; -- cgit v1.2.3 From 2d87a0674bd60d855e4008e2d84f5b23d7cb9b7d Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 10 Apr 2019 11:14:19 +0200 Subject: timekeeping: Audit clock adjustments Emit an audit record whenever the system clock is changed (i.e. shifted by a non-zero offset) by a syscall from userspace. 
The syscalls that can (at the time of writing) trigger such a record are: - settimeofday(2), stime(2), clock_settime(2) -- via do_settimeofday64() - adjtimex(2), clock_adjtime(2) -- via do_adjtimex() The new records have type AUDIT_TIME_INJOFFSET and contain the following fields: - sec -- the 'seconds' part of the offset - nsec -- the 'nanoseconds' part of the offset Example record (time was shifted backwards by ~15.875 seconds): type=TIME_INJOFFSET msg=audit(1530616049.652:13): sec=-16 nsec=124887145 The records of this type will be associated with the corresponding syscall records. Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Reviewed-by: Thomas Gleixner [PM: fixed a line width problem in __audit_tk_injoffset()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 7 +++++++ kernel/time/timekeeping.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 51a2ceb3a1ca..3843495d0083 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2512,6 +2512,13 @@ void __audit_fanotify(unsigned int response) AUDIT_FANOTIFY, "resp=%u", response); } +void __audit_tk_injoffset(struct timespec64 offset) +{ + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, + "sec=%lli nsec=%li", + (long long)offset.tv_sec, offset.tv_nsec); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..3d24be4cd607 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -1250,6 +1251,9 @@ out: /* signal hrtimers about time change */ clock_was_set(); + if (!ret) + audit_tk_injoffset(ts_delta); + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -2322,6 +2326,8 @@ int do_adjtimex(struct __kernel_timex *txc) ret = timekeeping_inject_offset(&delta); if (ret) return ret; + + audit_tk_injoffset(delta); } ktime_get_real_ts64(&ts); -- cgit v1.2.3 From 7e8eda734d30de81d06a949c9bf9853c445ede4e Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 10 Apr 2019 11:14:20 +0200 Subject: ntp: Audit NTP parameters adjustment Emit an audit record every time selected NTP parameters are modified from userspace (via adjtimex(2) or clock_adjtime(2)). These parameters may be used to indirectly change the system clock, and thus their modifications should be audited. Such events will now generate records of type AUDIT_TIME_ADJNTPVAL containing the following fields: - op -- which value was adjusted: - offset -- corresponding to the time_offset variable - freq -- corresponding to the time_freq variable - status -- corresponding to the time_status variable - adjust -- corresponding to the time_adjust variable - tick -- corresponding to the tick_usec variable - tai -- corresponding to the timekeeping's TAI offset - old -- the old value - new -- the new value Example records: type=TIME_ADJNTPVAL msg=audit(1530616044.507:7): op=status old=64 new=8256 type=TIME_ADJNTPVAL msg=audit(1530616044.511:11): op=freq old=0 new=49180377088000 The records of this type will be associated with the corresponding syscall records.
An overview of parameter changes that can be done via do_adjtimex() (based on information from Miroslav Lichvar) and whether they are audited: __timekeeping_set_tai_offset() -- sets the offset from the International Atomic Time (AUDITED) NTP variables: time_offset -- can adjust the clock by up to 0.5 seconds per call and also speed it up or slow down by up to about 0.05% (43 seconds per day) (AUDITED) time_freq -- can speed up or slow down by up to about 0.05% (AUDITED) time_status -- can insert/delete leap seconds and it also enables/ disables synchronization of the hardware real-time clock (AUDITED) time_maxerror, time_esterror -- change error estimates used to inform userspace applications (NOT AUDITED) time_constant -- controls the speed of the clock adjustments that are made when time_offset is set (NOT AUDITED) time_adjust -- can temporarily speed up or slow down the clock by up to 0.05% (AUDITED) tick_usec -- a more extreme version of time_freq; can speed up or slow down the clock by up to 10% (AUDITED) Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Reviewed-by: Thomas Gleixner Signed-off-by: Paul Moore --- kernel/auditsc.c | 22 ++++++++++++++++++++++ kernel/time/ntp.c | 22 +++++++++++++++++++--- kernel/time/ntp_internal.h | 4 +++- kernel/time/timekeeping.c | 7 ++++++- 4 files changed, 50 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3843495d0083..5371b59bde36 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2519,6 +2519,28 @@ void __audit_tk_injoffset(struct timespec64 offset) (long long)offset.tv_sec, offset.tv_nsec); } +static void audit_log_ntp_val(const struct audit_ntp_data *ad, + const char *op, enum audit_ntp_type type) +{ + const struct audit_ntp_val *val = &ad->vals[type]; + + if (val->newval == val->oldval) + return; + + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, + "op=%s old=%lli new=%lli", op, val->oldval, val->newval); +} + +void __audit_ntp_log(const struct audit_ntp_data *ad) +{ + audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); + audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); + audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); + audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); + audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); + audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..ac5555e25733 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -709,7 +710,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, * kernel time-keeping variables. used by xntpd. 
*/ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai) + s32 *time_tai, struct audit_ntp_data *ad) { int result; @@ -720,14 +721,29 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; ntp_update_frequency(); + + audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); } txc->offset = save_adjust; } else { - /* If there are input parameters, then process them: */ - if (txc->modes) + if (txc->modes) { + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + process_adjtimex_modes(txc, time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 40e6122e634e..908ecaa65fc3 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,8 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, + const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d24be4cd607..f366f2fdf1b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2307,6 +2307,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; + struct audit_ntp_data ad; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; @@ -2330,13 +2331,15 @@ int do_adjtimex(struct __kernel_timex *txc) audit_tk_injoffset(delta); } + audit_ntp_init(&ad); + ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai); + ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); @@ -2347,6 +2350,8 @@ int do_adjtimex(struct __kernel_timex *txc) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + audit_ntp_log(&ad); + /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) timekeeping_advance(TK_ADV_FREQ); -- cgit v1.2.3 From 02a8c817a31606b6b37c2b755f6569903f44241e Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Sun, 14 Apr 2019 18:58:46 +0200 Subject: bpf: add map helper functions push, pop, peek in more BPF programs commit f1a2e44a3aec ("bpf: add queue and stack maps") introduced new BPF helper functions: - BPF_FUNC_map_push_elem - BPF_FUNC_map_pop_elem - BPF_FUNC_map_peek_elem but they 
were made available only for network BPF programs. This patch makes them available for tracepoint, cgroup and lirc programs. Signed-off-by: Alban Crequy Cc: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 6 ++++++ kernel/trace/bpf_trace.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e58a6c247f56..fcde0f7b2585 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -713,6 +713,12 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..91800be0c8eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -569,6 +569,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: -- cgit v1.2.3 From c68d224e5ed15605e651e2482c6ffd95915ddf58 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Apr 2019 10:32:51 -0700 Subject: perf/core: Add perf_pmu_resched() as a global function This patch adds perf_pmu_resched(), a global function that can be called to force rescheduling of events for a given PMU. The function locks both cpuctx and task_ctx internally. This will be used by a subsequent patch. Signed-off-by: Stephane Eranian [ Simplified the calling convention. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Cc: nelson.dsouza@intel.com Cc: tonyj@suse.com Link: https://lkml.kernel.org/r/20190408173252.37932-2-eranian@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 30a572e4c6f1..abbd4b3b96c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2478,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_pmu_enable(cpuctx->ctx.pmu); } +void perf_pmu_resched(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + perf_ctx_unlock(cpuctx, task_ctx); +} + /* * Cross CPU call to install and enable a performance event * -- cgit v1.2.3 From e2abb398115e9c33f3d1e25bf6d1d08badc58b13 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2019 15:06:21 +0100 Subject: sched/fair: Remove unneeded prototype of capacity_of() The prototype of that function was already hoisted up in: commit 3b1baa6496e6 ("sched/fair: Add 'group_misfit_task' load-balance type") but that seems to have been missed. Get rid of the extra prototype. Signed-off-by: Valentin Schneider Acked-by: Quentin Perret Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator") Link: http://lkml.kernel.org/r/20190416140621.19884-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ed7f5f8107b7..b6cc0703b850 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5116,7 +5116,6 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu); static inline bool cpu_overutilized(int cpu) { -- cgit v1.2.3 From 6d25be5782e482eb93e3de0c94d0a517879377d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:48 +0100 Subject: sched/core, workqueues: Disentangle worker accounting from rq lock The worker accounting for CPU-bound workers is plugged into the core scheduler code and the wakeup code. This is not a hard requirement and can be avoided by keeping track of the state in the workqueue code itself. Keep track of the sleeping state in the worker itself and call the notifier before entering the core scheduler. There might be false positives when the task is woken between that call and actually scheduling, but that's not really different from scheduling and being woken immediately after switching away. nr_running is now updated when the task returns from schedule(), and it is later compared when the update is done from ttwu().
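Condensed, the change moves the hooks out of the rq-lock-protected wakeup and schedule paths and into the schedule() entry and exit paths. A sketch of the resulting shape, using the names from the diff below:

	/* on the way into schedule() */
	static inline void sched_submit_work(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER) {
			preempt_disable();
			wq_worker_sleeping(tsk);	/* sets worker->sleeping, may wake an idle worker */
			preempt_enable_no_resched();
		}
		/* plugged-IO flush omitted in this sketch */
	}

	/* on the way out of schedule() */
	static void sched_update_worker(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);		/* clears worker->sleeping, fixes up nr_running */
	}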
[ bigeasy: preempt_disable() around wq_worker_sleeping() by Daniel Bristot de Oliveira ] Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Daniel Bristot de Oliveira Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/ad2b29b5715f970bffc1a7026cabd6ff0b24076a.1532952814.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 88 +++++++++++---------------------------------- kernel/workqueue.c | 54 +++++++++++++--------------- kernel/workqueue_internal.h | 5 +-- 3 files changed, 48 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4778c48a7fda..6184a0856aab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1685,10 +1685,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; - - /* If a worker is waking up, notify the workqueue: */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); } /* @@ -2106,56 +2102,6 @@ out: return success; } -/** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet picked a replacement task. - */ - rq_unlock(rq, rf); - raw_spin_lock(&p->pi_lock); - rq_relock(rq, rf); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - trace_sched_waking(p); - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&rq->nr_iowait); - } - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); - } - - ttwu_do_wakeup(rq, p, 0, rf); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. @@ -3472,19 +3418,6 @@ static void __sched notrace __schedule(bool preempt) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) - try_to_wake_up_local(to_wakeup, &rf); - } } switch_count = &prev->nvcsw; } @@ -3544,6 +3477,20 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again + * in the possible wakeup of a kworker. 
+ */ + if (tsk->flags & PF_WQ_WORKER) { + preempt_disable(); + wq_worker_sleeping(tsk); + preempt_enable_no_resched(); + } + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -3552,6 +3499,12 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -3562,6 +3515,7 @@ asmlinkage __visible void __sched schedule(void) __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddee541ea97a..56180c9286f5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_waking_up - a worker is waking up + * wq_worker_running - a worker is running again * @task: task waking up - * @cpu: CPU @task is waking up to * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->cpu != cpu); + if (!worker->sleeping) + return; + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); - } + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct worker *next, *worker = kthread_data(task); struct worker_pool *pool; /* @@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; pool = worker->pool; - /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) - return NULL; + if (WARN_ON_ONCE(worker->sleeping)) + return; + + worker->sleeping = 1; + spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - to_wakeup = first_idle_worker(pool); - return to_wakeup ? 
to_wakeup->task : NULL; + !list_empty(&pool->worklist)) { + next = first_idle_worker(pool); + if (next) + wake_up_process(next->task); + } + spin_unlock_irq(&pool->lock); } /** @@ -4929,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool) * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in - * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -44,6 +44,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/ and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 1b174a2cb67a3a156d5a28426ae14241e6dfa655 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Apr 2019 09:53:13 +0200 Subject: sched/core: Remove ttwu_activate() After the removal of try_to_wake_up_local(), there is only one user of ttwu_activate() left, and since it is a trivial function, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6184a0856aab..3feb83df322e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1681,12 +1681,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) __schedstat_inc(p->se.statistics.nr_wakeups_sync); } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; -} - /* * Mark the task runnable and perform wakeup-preemption. */ @@ -1738,7 +1732,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, en_flags); + activate_task(rq, p, en_flags); + p->on_rq = TASK_ON_RQ_QUEUED; ttwu_do_wakeup(rq, p, wake_flags, rf); } -- cgit v1.2.3 From 7dd7788411646c9619aa6495f832bc0a9b0146b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Apr 2019 09:59:05 +0200 Subject: sched/core: Unify p->on_rq updates Almost all {,de}activate_task() invocations pair with p->on_rq updates, the exception being the usage in rt/deadline which hold both rq locks and therefore don't strictly need to set TASK_ON_RQ_MIGRATING, but it is harmless if we do anyway. Put the updates in {,de}activate_task() and cut down on repetition. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++----- kernel/sched/fair.c | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3feb83df322e..f4838b78b9f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); + + p->on_rq = TASK_ON_RQ_QUEUED; } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { + p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -1237,11 +1241,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -1733,7 +1735,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; ttwu_do_wakeup(rq, p, wake_flags, rf); } @@ -2408,7 +2409,6 @@ void wake_up_new_task(struct task_struct *p) post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3407,7 +3407,6 @@ static void __sched notrace __schedule(bool preempt) prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - prev->on_rq = 0; if (prev->in_iowait) { atomic_inc(&rq->nr_iowait); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b6cc0703b850..e5b100b6ba4e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7491,7 +7491,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -7627,7 +7626,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } -- cgit v1.2.3 From 2d65c42b43e53d61f1fd6b8d0a097451a4cffa24 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 10 Apr 2019 12:09:14 -0500 Subject: genirq/devres: Use struct_size() in devm_kzalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; size = sizeof(struct foo) + count * sizeof(struct boo); instance = devm_kzalloc(dev, size, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper. instance = devm_kzalloc(dev, struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190410170914.GA16161@embeddedor --- kernel/irq/devres.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index f808c6a97dcc..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, irq_flow_handler_t handler) { struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - gc = devm_kzalloc(dev, sz, GFP_KERNEL); + gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL); if (gc) irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, handler); -- cgit v1.2.3 From 0d306c31b2f77391dacdeaad4470c577f2aecc4f Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 16 Apr 2019 18:13:01 +0900 Subject: bpf: use BPF_CAST_CALL for casting bpf call verifier.c uses BPF_CAST_CALL for casting bpf call except at one place in jit_subprogs(). Let's use the macro for consistency. Signed-off-by: Prashant Bhole Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7220153c5b1..db301e9b5295 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7647,9 +7647,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; + insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - + __bpf_call_base; } /* we use the aux data to keep a list of the start addresses -- cgit v1.2.3 From 98af8452945c55652de68536afdde3b520fec429 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 12 Apr 2019 15:39:28 -0500 Subject: cpu/speculation: Add 'mitigations=' cmdline option Keeping track of the number of mitigations for all the CPU speculation bugs has become overwhelming for many users. It's getting more and more complicated to decide which mitigations are needed for a given architecture. Complicating matters is the fact that each arch tends to have its own custom way to mitigate the same vulnerability. Most users fall into a few basic categories: a) they want all mitigations off; b) they want all reasonable mitigations on, with SMT enabled even if it's vulnerable; or c) they want all reasonable mitigations on, with SMT disabled if vulnerable. Define a set of curated, arch-independent options, each of which is an aggregation of existing options: - mitigations=off: Disable all mitigations. - mitigations=auto: [default] Enable all the default mitigations, but leave SMT enabled, even if it's vulnerable. - mitigations=auto,nosmt: Enable all the default mitigations, disabling SMT if needed by a mitigation. Currently, these options are placeholders which don't actually do anything. They will be fleshed out in upcoming patches. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina (on x86) Reviewed-by: Jiri Kosina Cc: Borislav Petkov Cc: "H . 
Peter Anvin" Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Waiman Long Cc: Andrea Arcangeli Cc: Jon Masters Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-s390@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: Greg Kroah-Hartman Cc: Tyler Hicks Cc: Linus Torvalds Cc: Randy Dunlap Cc: Steven Price Cc: Phil Auld Link: https://lkml.kernel.org/r/b07a8ef9b7c5055c3a4637c87d07c296d5016fe0.1555085500.git.jpoimboe@redhat.com --- kernel/cpu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d1c6d152da89..e70a90634b41 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2279,3 +2279,18 @@ void __init boot_cpu_hotplug_init(void) #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } + +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + cpu_mitigations = CPU_MITIGATIONS_OFF; + else if (!strcmp(arg, "auto")) + cpu_mitigations = CPU_MITIGATIONS_AUTO; + else if (!strcmp(arg, "auto,nosmt")) + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + + return 0; +} +early_param("mitigations", mitigations_parse_cmdline); -- cgit v1.2.3 From 77361825bb01ecadf3ac8622e2e4dbc28806e858 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:32 +0200 Subject: bpf: cpumap use ptr_ring_consume_batched Move ptr_ring dequeue outside loop, that allocate SKBs and calls network stack, as these operations that can take some time. The ptr_ring is a communication channel between CPUs, where we want to reduce/limit any cacheline bouncing. Do a concentrated bulk dequeue via ptr_ring_consume_batched, to shorten the period and times the remote cacheline in ptr_ring is read Batch size 8 is both to (1) limit BH-disable period, and (2) consume one cacheline on 64-bit archs. After reducing the BH-disable section further then we can consider changing this, while still thinking about L1 cacheline size being active. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3c18260403dd..430103e182a0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +#define CPUMAP_BATCH 8 + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -252,8 +254,9 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_frame *xdpf; + unsigned int drops = 0, sched = 0; + void *frames[CPUMAP_BATCH]; + int i, n; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -269,14 +272,16 @@ static int cpu_map_kthread_run(void *data) sched = cond_resched(); } - /* Process packets in rcpu->queue */ - local_bh_disable(); /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. 
*/ - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { + n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + local_bh_disable(); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb; int ret; @@ -290,13 +295,9 @@ static int cpu_map_kthread_run(void *data) ret = netif_receive_skb_core(skb); if (ret == NET_RX_DROP) drops++; - - /* Limit BH-disable period */ - if (++processed == 8) - break; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); local_bh_enable(); /* resched point, may call do_softirq() */ } -- cgit v1.2.3 From 8f0504a97e1ba6b70e1c8b5a88255c280f263287 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:43 +0200 Subject: bpf: cpumap do bulk allocation of SKBs As cpumap now batch-consumes xdp_frames from the ptr_ring, it knows how many SKBs it needs to allocate. Thus, let's bulk-allocate these SKBs via the kmem_cache_alloc_bulk() API, and use the previously introduced function build_skb_around(). Notice that the flag __GFP_ZERO asks the slab/slub allocator to clear the memory for us. This does clear a larger area than needed, but my micro benchmarks on Intel CPUs show that this is slightly faster, because a cacheline-aligned area is cleared for the SKBs. (For the SLUB allocator, there is a future optimization potential, because SKBs will with high probability originate from the same page. If we can find/identify contiguous memory areas, then the Intel CPU memset rep stos will have a real performance gain.) Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 430103e182a0..732d6ced3987 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -160,12 +160,12 @@ static void cpu_map_kthread_stop(struct work_struct *work) } static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf) + struct xdp_frame *xdpf, + struct sk_buff *skb) { unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; - struct sk_buff *skb; /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; @@ -191,8 +191,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb(pkt_data_start, frame_size); - if (!skb) + skb = build_skb_around(skb, pkt_data_start, frame_size); + if (unlikely(!skb)) return NULL; skb_reserve(skb, hard_start_headroom); @@ -256,7 +256,9 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; - int i, n; + void *skbs[CPUMAP_BATCH]; + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -278,14 +280,20 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. 
*/ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < n; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops = n; + } local_bh_disable(); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb; + struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf); + skb = cpu_map_build_skb(rcpu, xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; -- cgit v1.2.3 From 86d231459d6dc9094e70c35c3517f4ef860b2f1e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:48 +0200 Subject: bpf: cpumap memory prefetchw optimizations for struct page A lot of the performance gain comes from this patch. While analysing the performance overhead, it was found that the largest CPU stalls were caused when touching the struct page area. It is first read with a READ_ONCE from build_skb_around via page_is_pfmemalloc(), and, when freed, written by the page_frag_free() call. Measurements show that the prefetchw (W) variant of the prefetch operation is needed to achieve the performance gain. We believe this optimization is two-fold: first, the W-variant saves one step in the cache-coherency protocol; second, it helps us avoid the non-temporal prefetch HW optimizations and brings this into all cache levels. It might be worth investigating whether a prefetch into L2 would have the same benefit. Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 732d6ced3987..cf727d77c6c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -280,6 +280,18 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. */ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + for (i = 0; i < n; i++) { + void *f = frames[i]; + struct page *page = virt_to_page(f); + + /* Bring struct page memory area to curr CPU. Read by + * build_skb_around via page_is_pfmemalloc(), and when + * freed written by page_frag_free call. + */ + prefetchw(page); + } + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); if (unlikely(m == 0)) { for (i = 0; i < n; i++) -- cgit v1.2.3 From 0d2cc3b3453254f1c56f9456ba03e092ed4cfb72 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:41 +0200 Subject: locking/lockdep: Move valid_state() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING valid_state() and print_usage_bug*() functions are not used beyond irq locking correctness checks under CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING. Sadly the "unused function" warning wouldn't fire because valid_state() is inline, so the unused case has remained unseen until now. So move them inside the appropriate CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING section. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..9c5819ef4a28 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2784,6 +2784,12 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + + static void print_usage_bug_scenario(struct held_lock *lock) { @@ -2853,10 +2859,6 @@ valid_state(struct task_struct *curr, struct held_lock *this, return 1; } -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: -- cgit v1.2.3 From c902a1e8d9c9b47cd8faa16892710247cdda9b02 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:42 +0200 Subject: locking/lockdep: Map remaining magic numbers to lock usage mask names Clarify the code by mapping some more constant numbers that haven't yet been named after their corresponding LOCK_USAGE_* symbols. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9c5819ef4a28..2288aa2fa4c6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -516,11 +516,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { char c = '.'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '+'; if (class->usage_mask & lock_flag(bit)) { c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '?'; } @@ -1971,7 +1971,10 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; + if (bit & LOCK_USAGE_READ_MASK) + return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; + else + return state_names[bit >> LOCK_USAGE_DIR_MASK]; } static int exclusive_bit(int new_bit) @@ -3017,7 +3020,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { - return state_verbose_f[bit >> 2](class); + return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, -- cgit v1.2.3 From 627f364d24c009b61c9199b2c75006e35c294675 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:43 +0200 Subject: locking/lockdep: Use expanded masks on find_usage_*() functions In order to optimize check_irq_usage() and factorize all the IRQ usage validations, we'll need to be able to check multiple lock usage bits at once; a sketch of the widened interface follows. 
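As a hedged illustration, not part of the patch itself: lock_flag(), hlock_class(), find_usage_forwards() and the LOCK_USED_IN_* bits are lockdep's own names, while the OR'd combination below is made up for the sketch, and "this" stands for the held_lock under scrutiny as elsewhere in lockdep.

	/* Sketch only: with a mask rather than a single bit, one BFS
	 * walk can match any of several usage bits in a single pass. */
	struct lock_list root, *target_entry;
	unsigned long usage_mask = lock_flag(LOCK_USED_IN_HARDIRQ) |
				   lock_flag(LOCK_USED_IN_SOFTIRQ);
	int ret;

	root.parent = NULL;
	root.class = hlock_class(this);
	ret = find_usage_forwards(&root, usage_mask, &target_entry);
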
Prepare the low level usage mask check functions for that purpose. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2288aa2fa4c6..5e149dd78298 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1682,9 +1682,9 @@ check_redundant(struct lock_list *root, struct lock_class *target, * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); + return entry->class->usage_mask & *(unsigned long *)mask; } @@ -1700,14 +1700,14 @@ static inline int usage_match(struct lock_list *entry, void *bit) * Return <0 on error. */ static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1723,14 +1723,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, * Return <0 on error. */ static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1935,7 +1935,7 @@ check_usage(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); + ret = find_usage_backwards(&this, lock_flag(bit_backwards), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -1943,7 +1943,7 @@ check_usage(struct task_struct *curr, struct held_lock *prev, that.parent = NULL; that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); + ret = find_usage_forwards(&that, lock_flag(bit_forwards), &target_entry1); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2941,7 +2941,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); + ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2965,7 +2965,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); + ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) -- cgit v1.2.3 From 8808a7c65423cdd07ea12f9ecd812e56c7857421 Mon Sep 17 00:00:00 2001 From: Peter 
Zijlstra Date: Tue, 9 Apr 2019 13:59:03 +0200 Subject: locking/lockdep: Generate LOCKF_ bit composites Instead of open-coding the bitmasks, generate them using the lockdep_states.h header. This prepares for additional states, which would make the manual masks tedious and error prone. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d4c197425f68..2b3ffd4117ad 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -42,13 +42,29 @@ enum { __LOCKF(USED) }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE -#define LOCKF_ENABLED_IRQ_READ \ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE /* * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, -- cgit v1.2.3 From 76e1552466ff2da8b909df0fff3600ec1c27edcc Mon Sep 17 00:00:00 2001 From: Arash Fotouhi Date: Fri, 22 Mar 2019 19:28:32 -0700 Subject: watchdog: Fix typo in comment Signed-off-by: Arash Fotouhi Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: loberman@redhat.com Cc: vincent.whitchurch@axis.com Link: http://lkml.kernel.org/r/1553308112-3513-1-git-send-email-arash@arashfotouhi.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6a5787233113..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -590,7 +590,7 @@ static void lockup_detector_reconfigure(void) * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When - * the threads are sucessfully initialized, take the proper locks and + * the threads are successfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. */ static __init void lockup_detector_setup(void) -- cgit v1.2.3 From 471ba0e686cb13752bc1ff3216c54b69a2d250ea Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Apr 2019 19:34:03 +1000 Subject: irq_work: Do not raise an IPI when queueing work on the local CPU The QEMU PowerPC/PSeries machine model was not expecting a self-IPI, and it may be a surprising thing to do in general, so have irq_work_queue_on() do local queueing when the target is the current CPU. 
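A hedged usage sketch of the API touched here: the callback, its message and kick_cpu() are invented for illustration, while DEFINE_IRQ_WORK() and irq_work_queue_on() are the kernel's own interfaces.

	#include <linux/irq_work.h>
	#include <linux/printk.h>
	#include <linux/smp.h>

	static void my_work_func(struct irq_work *work)
	{
		/* Runs in hard-IRQ context once the queued work fires. */
		pr_info("irq_work ran on CPU %d\n", smp_processor_id());
	}

	static DEFINE_IRQ_WORK(my_work, my_work_func);

	static void kick_cpu(int cpu)
	{
		/* With this patch, cpu == smp_processor_id() queues
		 * locally instead of raising a self-IPI. */
		irq_work_queue_on(&my_work, cpu);
	}
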
Suggested-by: Steven Rostedt Reported-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Suraj Jitindar Singh Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190409093403.20994-1-npiggin@gmail.com [ Simplified the preprocessor comments. Fixed unbalanced curly brackets pointed out by Thomas. ] Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 75 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..73288914ed5e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void) */ } -/* - * Enqueue the irq_work @work on @cpu unless it's already pending - * somewhere. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue_on(struct irq_work *work, int cpu) +/* Enqueue on current CPU, work must already be claimed and preempt disabled */ +static void __irq_work_queue_local(struct irq_work *work) { - /* All work should have been flushed before going offline */ - WARN_ON_ONCE(cpu_is_offline(cpu)); - -#ifdef CONFIG_SMP - - /* Arch remote IPI send/receive backend aren't NMI safe */ - WARN_ON_ONCE(in_nmi()); + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + arch_irq_work_raise(); + } +} +/* Enqueue the irq work @work on the current CPU */ +bool irq_work_queue(struct irq_work *work) +{ /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); - -#else /* #ifdef CONFIG_SMP */ - irq_work_queue(work); -#endif /* #else #ifdef CONFIG_SMP */ + /* Queue the entry and raise the IPI if needed. */ + preempt_disable(); + __irq_work_queue_local(work); + preempt_enable(); return true; } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* Enqueue the irq work @work on the current CPU */ -bool irq_work_queue(struct irq_work *work) +/* + * Enqueue the irq_work @work on @cpu unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue_on(struct irq_work *work, int cpu) { +#ifndef CONFIG_SMP + return irq_work_queue(work); + +#else /* CONFIG_SMP: */ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - /* Queue the entry and raise the IPI if needed. 
*/ preempt_disable(); - - /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); + __irq_work_queue_local(work); } - preempt_enable(); return true; +#endif /* CONFIG_SMP */ } -EXPORT_SYMBOL_GPL(irq_work_queue); + bool irq_work_needs_cpu(void) { -- cgit v1.2.3 From b1546edcf2aab710a5afc98d65c948a4bfac0353 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 18 Apr 2019 22:47:13 +0800 Subject: sched/core: Make some functions static Fix these sparse warnings: kernel/sched/core.c:6577:11: warning: symbol 'min_cfs_quota_period' was not declared. Should it be static? kernel/sched/core.c:6657:5: warning: symbol 'tg_set_cfs_quota' was not declared. Should it be static? kernel/sched/core.c:6670:6: warning: symbol 'tg_get_cfs_quota' was not declared. Should it be static? kernel/sched/core.c:6683:5: warning: symbol 'tg_set_cfs_period' was not declared. Should it be static? kernel/sched/core.c:6693:6: warning: symbol 'tg_get_cfs_period' was not declared. Should it be static? kernel/sched/fair.c:2596:6: warning: symbol 'task_tick_numa' was not declared. Should it be static? Signed-off-by: YueHaibing Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190418144713.34332-1-yuehaibing@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/fair.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f4838b78b9f9..8b64ef0d5589 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6522,7 +6522,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6602,7 +6602,7 @@ out_unlock: return ret; } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; @@ -6615,7 +6615,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; @@ -6628,7 +6628,7 @@ long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; @@ -6638,7 +6638,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e5b100b6ba4e..13bafe350abf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2593,7 +2593,7 @@ 
out: /* * Drive the periodic memory faults.. */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr) { struct callback_head *work = &curr->numa_work; u64 period, now; -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add a ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit, so I use a large bitmap. The input and output are in the form of a list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use the ratemask instead of a hard-coded limit. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443, but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove the unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocating it. A pointer to it is still needed because of the way proc_do_large_bitmap() works. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9ec050bcf46..599510a3355e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3326,6 +3326,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ @@ -3366,3 +3371,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); -- cgit v1.2.3 From bee9853932e90ce94bce4276ec6b7b06bc48070b Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Wed, 6 Mar 2019 20:13:33 -0500 Subject: sched/core: Fix typo in comment Signed-off-by: Joel Savitz Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1551921213-813-1-git-send-email-jsavitz@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b64ef0d5589..fb09eaad1d3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -924,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
*/ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -- cgit v1.2.3 From bff9504bfc9c5c6610b42d47f689f350fd969eb8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Mar 2019 14:47:53 -0500 Subject: rseq: Clean up comments by reflecting removal of event counter The "event counter" was removed from rseq before it was merged upstream. However, a few comments in the source code still refer to it. Adapt the comments to match reality. Signed-off-by: Mathieu Desnoyers Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: H. Peter Anvin Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E. McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20190305194755.2602-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/rseq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..849afe749131 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs) * - signal delivery, * and return to user-space. * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. -- cgit v1.2.3 From 83b0b15bcb0f700e7c1d070aae2e7841170a4c33 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Mar 2019 14:47:54 -0500 Subject: rseq: Remove superfluous rseq_len from task_struct The rseq system call, when invoked with flags of "0" or "RSEQ_FLAG_UNREGISTER" values, expects the rseq_len parameter to be equal to sizeof(struct rseq), which is fixed-size and fixed-layout, specified in uapi linux/rseq.h. Expecting a fixed size for rseq_len is a design choice that ensures multiple libraries and application defining __rseq_abi in the same process agree on its exact size. Considering that this size is and will always be the same value, there is no point in saving this value within task_struct rseq_len. Remove this field from task_struct. No change in functionality intended. Signed-off-by: Mathieu Desnoyers Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: H. Peter Anvin Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E. McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20190305194755.2602-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/rseq.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 849afe749131..9424ee90589e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -313,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. 
*/ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (current->rseq_len != rseq_len) + if (rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -321,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (ret) return ret; current->rseq = NULL; - current->rseq_len = 0; current->rseq_sig = 0; return 0; } @@ -335,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || current->rseq_len != rseq_len) + if (current->rseq != rseq || rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -353,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; - current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been -- cgit v1.2.3 From f6c6010a07734103a31faa0cc977641b358c45b0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 5 Mar 2019 16:34:32 +0800 Subject: mm/resource: Use resource_overlaps() to simplify region_intersects() The three checks in region_intersects() are basically an open-coded version of resource_overlaps() - so use the real thing. Also fix typos in comments while at it. Signed-off-by: Wei Yang Reviewed-by: Like Xu Reviewed-by: Yuan Yao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: bhelgaas@google.com Cc: bp@suse.de Cc: dan.j.williams@intel.com Cc: jack@suse.cz Cc: rdunlap@infradead.org Cc: tiwai@suse.de Link: http://lkml.kernel.org/r/20190305083432.23675-1-richardw.yang@linux.intel.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- kernel/iomem.c | 4 ++-- kernel/resource.c | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..93c264444510 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size, * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return * a pointer into the direct map. * @@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { /* - * MEMREMAP_WB is special in that it can be satisifed + * MEMREMAP_WB is special in that it can be satisfied * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where * the requested range is potentially in System RAM. 
diff --git a/kernel/resource.c b/kernel/resource.c index 92190f62ebc5..8c15f846e8ef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - resource_size_t end = start + size - 1; + struct resource res; int type = 0; int other = 0; struct resource *p; + res.start = start; + res.end = start + size - 1; + read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); - if (start >= p->start && start <= p->end) - is_type ? type++ : other++; - if (end >= p->start && end <= p->end) - is_type ? type++ : other++; - if (p->start >= start && p->end <= end) + if (resource_overlaps(p, &res)) is_type ? type++ : other++; } read_unlock(&resource_lock); -- cgit v1.2.3 From 1a010e29cfa00fee2888fd2fd4983f848cbafb58 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 Feb 2019 11:10:17 +0300 Subject: sched/rt: Check integer overflow at usec to nsec conversion Example of unhandled overflows: # echo 18446744073709651 > cpu.rt_runtime_us # cat cpu.rt_runtime_us 99 # echo 18446744073709900 > cpu.rt_period_us # cat cpu.rt_period_us 348 After this patch they will fail with -EINVAL. Signed-off-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/155125501739.293431.5252197504404771496.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 90fa23d36565..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; -- cgit v1.2.3 From 5b61d50ab4ef590f5e1d4df15cd2cea5f5715308 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 Feb 2019 11:10:18 +0300 Subject: sched/core: Handle overflow in cpu_shares_write_u64 Bit shift in scale_load() could overflow shares. This patch saturates it to MAX_SHARES like following sched_group_set_shares(). 
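To see where the numbers in the example below come from, a hedged walk-through (assuming the 64-bit scale_load() shift of 10 and MAX_SHARES == 1UL << 18, per kernel/sched/sched.h): 9223372036854776832 is 2^63 + 2^10, so the scale_load() shift pushes the 2^63 part out of the 64-bit word and leaves 2^20; reading the file back applies scale_load_down(), giving 2^10 = 1024. With the new check, any shareval above scale_load_down(ULONG_MAX) saturates to MAX_SHARES, which reads back as 2^18 = 262144.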
Example: # echo 9223372036854776832 > cpu.shares # cat cpu.shares Before patch: 1024 After patch: 262144 Signed-off-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/155125501891.293431.3345233332801109696.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb09eaad1d3a..685b1541ce51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6507,6 +6507,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -- cgit v1.2.3 From 1a8b4540db732ca16c9e43ac7c08b1b8f0b252d8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 Feb 2019 11:10:20 +0300 Subject: sched/core: Check quota and period overflow at usec to nsec conversion Large values could overflow u64 and pass the following sanity checks. # echo 18446744073750000 > cpu.cfs_period_us # cat cpu.cfs_period_us 40448 # echo 18446744073750000 > cpu.cfs_quota_us # cat cpu.cfs_quota_us 40448 After this patch they will fail with -EINVAL. Signed-off-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/155125502079.293431.3947497929372138600.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 685b1541ce51..de8ab411826c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6611,8 +6611,10 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; - else + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } @@ -6634,6 +6636,9 @@ static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; + if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; -- cgit v1.2.3 From 13e792a19d4e3a1c64e94197ba357685fd584ded Mon Sep 17 00:00:00 2001 From: Laurent Gauthier Date: Sat, 5 Jan 2019 00:07:45 +0100 Subject: tick: Fix typos in comments Signed-off-by: Laurent Gauthier Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7541cbca695e..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -809,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * either the CPU handling the broadcast * interrupt or we got woken by something else. * - * We are not longer in the broadcast mask, so + * We are no longer in the broadcast mask, so * if the cpu local expiry time is already * reached, we would reprogram the cpu local * timer with an already expired event. 
* * This can lead to a ping-pong when we return - * to idle and therefor rearm the broadcast + * to idle and therefore rearm the broadcast * timer before the cpu local timer was able * to fire. This happens because the forced * reprogramming makes sure that the event -- cgit v1.2.3 From b6fbbf31d15b5072250ec6ed79e415a1160e5621 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Dec 2018 14:34:44 +0100 Subject: cgroup/cpuset: Update stale generate_sched_domains() comments Commit: fc560a26acce ("cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()") removed the local list (q) that was used to perform a top-down scan of all cpusets; however, comments mentioning it were not updated. Update the comments to reflect the current implementation. Signed-off-by: Juri Lelli Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: lizefan@huawei.com Link: http://lkml.kernel.org/r/20181219133445.31982-1-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4834c4214e9c..6a1942ed781c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -740,11 +740,10 @@ static inline int nr_cpusets(void) * Must be called with cpuset_mutex held. * * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. + * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -775,7 +774,7 @@ static inline int nr_cpusets(void) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ -- cgit v1.2.3 From cb0c04143b6196f4a479ba113706329fc667ee15 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Dec 2018 14:34:45 +0100 Subject: sched/topology: Update init_sched_domains() comment Holding the hotplug lock is not a requirement anymore for callers of sched_init_domains() after commit: 6acce3ef8452 ("sched: Remove get_online_cpus() usage") Update the relevant comment preceding init_sched_domains(). 
Signed-off-by: Juri Lelli Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: lizefan@huawei.com Link: http://lkml.kernel.org/r/20181219133445.31982-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c65b31e9458b..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2081,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) } /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups. For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future. */ int sched_init_domains(const struct cpumask *cpu_map) { -- cgit v1.2.3 From 50943f3e136adfc421f9768d6ae09ba7b83aaefd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:01 -0700 Subject: cgroup: rename freezer.c into legacy_freezer.c Freezer.c will contain an implementation of cgroup v2 freezer, so let's rename the v1 freezer to avoid naming conflicts. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/freezer.c | 481 ----------------------------------------- kernel/cgroup/legacy_freezer.c | 481 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 482 insertions(+), 482 deletions(-) delete mode 100644 kernel/cgroup/freezer.c create mode 100644 kernel/cgroup/legacy_freezer.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..8d5689ca94b9 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c deleted file mode 100644 index 08236798d173..000000000000 --- a/kernel/cgroup/freezer.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. 
- */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ - - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; - -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; - -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct freezer, css) : NULL; -} - -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); -} - -static struct freezer *parent_freezer(struct freezer *freezer) -{ - return css_freezer(freezer->css.parent); -} - -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; - - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); - - return ret; -} - -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; - - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); - - return &freezer->css; -} - -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. - */ -static int freezer_css_online(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; -} - -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. - */ -static void freezer_css_offline(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); -} - -static void freezer_css_free(struct cgroup_subsys_state *css) -{ - kfree(css_freezer(css)); -} - -/* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. - * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. - */ -static void freezer_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. 
- * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } - } - - mutex_unlock(&freezer_mutex); -} - -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. - */ -static void freezer_fork(struct task_struct *task) -{ - struct freezer *freezer; - - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) - return; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. - */ -static void update_if_frozen(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; - struct css_task_iter it; - struct task_struct *task; - - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; - - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); - - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); - - /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); - - while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. 
- */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); - } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); - css_task_iter_end(&it); -} - -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); -} - -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. - */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) -{ - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) - return; - - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } - } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; - - /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. 
- */ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - bool freeze; - - buf = strstrip(buf); - - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; - - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} - -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} - -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} - -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set.
+ */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { + struct cgroup_subsys_state css; + unsigned int state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ + return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ + bool ret; + + rcu_read_lock(); + ret = task_freezer(task)->state & CGROUP_FREEZING; + rcu_read_unlock(); + + return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. + */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct freezer *parent = parent_freezer(freezer); + + mutex_lock(&freezer_mutex); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + mutex_unlock(&freezer_mutex); + return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + + mutex_lock(&freezer_mutex); + + if (freezer->state & CGROUP_FREEZING) + atomic_dec(&system_freezing_cnt); + + freezer->state = 0; + + mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. 
+ * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. + */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? */ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, 0, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. 
+ */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. 
+ */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; -- cgit v1.2.3 From aade7f9efba098859681f8e88d81a5b44ad09b12 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:02 -0700 Subject: cgroup: implement __cgroup_task_count() helper The helper is identical to the existing cgroup_task_count() except it doesn't take the css_set_lock by itself, assuming that the caller does. Also, move cgroup_task_count() implementation into kernel/cgroup/cgroup.c, as there is nothing specific to cgroup v1. 
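As an illustration of the caller contract this helper enables (a minimal sketch, not part of the patch; the function below is hypothetical): a caller that already holds css_set_lock for other reasons can now read the task count directly, without re-taking the lock as cgroup_task_count() would.

/* Hypothetical caller: reads the task count inside an existing
 * css_set_lock critical section. */
static bool cgrp_has_tasks_locked(struct cgroup *cgrp)
{
	lockdep_assert_held(&css_set_lock);

	return __cgroup_task_count(cgrp) > 0;
}
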
Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 16 ---------------- kernel/cgroup/cgroup.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..02c001ffe2e2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -240,6 +240,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..68ca5de7ec27 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -342,22 +342,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f219c195a9a5..3008ea684aa0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -593,6 +593,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; -- cgit v1.2.3 From 4dcabece4c3a9f9522127be12cc12cc120399b2f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:03 -0700 Subject: cgroup: protect cgroup->nr_(dying_)descendants by css_set_lock The number of descendant cgroups and the number of dying descendant cgroups are currently synchronized using the cgroup_mutex. The number of descendant cgroups will be required by the cgroup v2 freezer, which will use it to determine if a cgroup is frozen (depending on the total number of descendants and the number of frozen descendants). It's not always acceptable to grab the cgroup_mutex, especially from quite hot paths (e.g. exit()). To avoid this, let's additionally synchronize these counters using the css_set_lock. So, it's safe to read these counters with either cgroup_mutex or css_set_lock locked, and both locks must be acquired to change them.
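A rough sketch of that locking rule (illustrative only, not from the patch; the reader function below is hypothetical): readers may hold either lock, writers must hold both, as the hunks below do.

/* Reader: either cgroup_mutex or css_set_lock is sufficient. */
static int read_nr_dying_descendants(struct cgroup *cgrp)
{
	int nr;

	spin_lock_irq(&css_set_lock);
	nr = cgrp->nr_dying_descendants;
	spin_unlock_irq(&css_set_lock);

	return nr;
}
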
Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3008ea684aa0..786ceef2f222 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4811,9 +4811,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5031,12 +5033,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (tcgrp != cgrp) tcgrp->nr_descendants++; } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5321,10 +5325,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); -- cgit v1.2.3 From 76f969e8948d82e78e1bc4beb6b9465908e74873 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:04 -0700 Subject: cgroup: cgroup v2 freezer Cgroup v1 implements the freezer controller, which provides the ability to stop the workload in a cgroup and temporarily free up some resources (cpu, io, network bandwidth and, potentially, memory) for some other tasks. Cgroup v2 lacks this functionality. This patch implements a freezer for cgroup v2. The cgroup v2 freezer tries to put tasks into a state similar to jobctl stop. This means that tasks can be killed, ptraced (using PTRACE_SEIZE*), and interrupted. It is possible to attach to a frozen task, get some information (e.g. read registers) and detach. It's also possible to migrate a frozen task to another cgroup. This distinguishes the cgroup v2 freezer from the cgroup v1 freezer, which mostly tried to imitate the system-wide freezer. However, while uninterruptible sleep is fine when all tasks are going to be frozen (the hibernation case), it's not an acceptable state when only some subset of the system is frozen. The cgroup v2 freezer does not support freezing kthreads. If a non-root cgroup contains a kthread, the cgroup can still be frozen, but the kthread will remain running, the cgroup will be shown as non-frozen, and the notification will not be delivered. * PTRACE_ATTACH is not working because non-fatal signal delivery is blocked in frozen state. There are some interface differences between the cgroup v1 and cgroup v2 freezers too, which are required to conform to the cgroup v2 interface design principles: 1) There is no separate controller, which has to be turned on: the functionality is always available and is represented by the cgroup.freeze and cgroup.events cgroup control files. 2) The desired state is defined by the cgroup.freeze control file. Any hierarchical configuration is allowed. 3) The interface is asynchronous. The actual state is available using the cgroup.events control file ("frozen" field). There are no dedicated transitional states.
4) It's allowed to make any changes to the cgroup hierarchy (create new cgroups, remove old cgroups, move tasks between cgroups) regardless of whether some cgroups are frozen. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo No-objection-from-me-by: Oleg Nesterov Cc: kernel-team@fb.com --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup.c | 110 ++++++++++++++++- kernel/cgroup/freezer.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 2 + kernel/signal.c | 70 ++++++++++- 5 files changed, 491 insertions(+), 10 deletions(-) create mode 100644 kernel/cgroup/freezer.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 8d5689ca94b9..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 786ceef2f222..6895464b54c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2435,8 +2435,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. + */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -3477,8 +3484,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3540,6 +3550,40 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) } #endif +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4683,6 +4727,12 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_freeze_show, + .write = cgroup_freeze_write, + }, { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, @@ -5033,12 +5083,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too.
+ */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } spin_unlock_irq(&css_set_lock); @@ -5329,6 +5396,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. + */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); @@ -5782,6 +5855,29 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + struct cgroup *cgrp; + + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + cgrp = cset->dfl_cgrp; + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. + */ + } + spin_unlock_irq(&css_set_lock); } @@ -5830,6 +5926,12 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; + + if (unlikely(cgroup_task_frozen(tsk))) + cgroup_freezer_frozen_exit(tsk); + else if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c new file mode 100644 index 000000000000..9d8cda478fc9 --- /dev/null +++ b/kernel/cgroup/freezer.c @@ -0,0 +1,317 @@ +//SPDX-License-Identifier: GPL-2.0 +#include <linux/cgroup.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> + +#include "cgroup-internal.h" + +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) +{ + int desc = 1; + + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. + */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + desc++; + } + } + } +} + +/* + * Revisit the cgroup frozen state.
+ * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) +{ + bool frozen; + + lockdep_assert_held(&css_set_lock); + + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); + + if (frozen) { + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; + + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? */ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; + + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); +} + +/* + * Increment cgroup's nr_frozen_tasks. + */ +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) +{ + cgrp->freezer.nr_frozen_tasks++; +} + +/* + * Decrement cgroup's nr_frozen_tasks. + */ +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) +{ + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); +} + +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) +{ + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); +} + +/* + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + * + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. + */ +void cgroup_leave_frozen(bool always_leave) +{ + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } + spin_unlock_irq(&css_set_lock); + + if (unlikely(current->frozen)) { + /* + * If the task remained in the frozen state, + * make sure it won't reach userspace without + * entering the signal handling loop. + */ + spin_lock_irq(&current->sighand->siglock); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + } +} + +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. + */ +static void cgroup_freeze_task(struct task_struct *task, bool freeze) +{ + unsigned long flags; + + /* If the task is about to die, don't bother with freezing it. */ + if (!lock_task_sighand(task, &flags)) + return; + + if (freeze) { + task->jobctl |= JOBCTL_TRAP_FREEZE; + signal_wake_up(task, false); + } else { + task->jobctl &= ~JOBCTL_TRAP_FREEZE; + wake_up_process(task); + } + + unlock_task_sighand(task, &flags); +} + +/* + * Freeze or unfreeze all tasks in the given cgroup.
+ */ +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + if (freeze) + set_bit(CGRP_FREEZE, &cgrp->flags); + else + clear_bit(CGRP_FREEZE, &cgrp->flags); + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, 0, &it); + while ((task = css_task_iter_next(&it))) { + /* + * Ignore kernel threads here. Freezing cgroups containing + * kthreads isn't supported. + */ + if (task->flags & PF_KTHREAD) + continue; + cgroup_freeze_task(task, freeze); + } + css_task_iter_end(&it); + + /* + * Cgroup state should be revisited here to cover empty leaf cgroups + * and cgroups which descendants are already in the desired state. + */ + spin_lock_irq(&css_set_lock); + if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); +} + +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups. + */ +void cgroup_freezer_migrate_task(struct task_struct *task, + struct cgroup *src, struct cgroup *dst) +{ + lockdep_assert_held(&css_set_lock); + + /* + * Kernel threads are not supposed to be frozen at all. + */ + if (task->flags & PF_KTHREAD) + return; + + /* + * Adjust counters of freezing and frozen tasks. + * Note, that if the task is frozen, but the destination cgroup is not + * frozen, we bump both counters to keep them balanced. + */ + if (task->frozen) { + cgroup_inc_frozen_cnt(dst); + cgroup_dec_frozen_cnt(src); + } + cgroup_update_frozen(dst); + cgroup_update_frozen(src); + + /* + * Force the task to the desired state. + */ + cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); +} + +void cgroup_freezer_frozen_exit(struct task_struct *task) +{ + struct cgroup *cgrp = task_dfl_cgroup(task); + + lockdep_assert_held(&css_set_lock); + + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); +} + +void cgroup_freeze(struct cgroup *cgrp, bool freeze) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + bool applied = false; + + lockdep_assert_held(&cgroup_mutex); + + /* + * Nothing changed? Just exit. + */ + if (cgrp->freezer.freeze == freeze) + return; + + cgrp->freezer.freeze = freeze; + + /* + * Propagate changes downwards the cgroup tree. + */ + css_for_each_descendant_pre(css, &cgrp->self) { + dsct = css->cgroup; + + if (cgroup_is_dead(dsct)) + continue; + + if (freeze) { + dsct->freezer.e_freeze++; + /* + * Already frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 1) + continue; + } else { + dsct->freezer.e_freeze--; + /* + * Still frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 0) + continue; + + WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + } + + /* + * Do change actual state: freeze or unfreeze. + */ + cgroup_do_freeze(dsct, freeze); + applied = true; + } + + /* + * Even if the actual state hasn't changed, let's notify a user. + * The state can be enforced by an ancestor cgroup: the cgroup + * can already be in the desired state or it can be locked in the + * opposite state, so that the transition will never happen. + * In both cases it's better to notify a user, that there is + * nothing to wait for. 
+ */ + if (!applied) + cgroup_file_notify(&cgrp->events_file); +} diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..8097a0cce4db 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,9 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); + cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); + cgroup_leave_frozen(false); freezer_count(); if (killed) { diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..095e0fc57b25 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -43,6 +43,7 @@ #include #include #include +#include <linux/cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -146,9 +147,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) static bool recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & JOBCTL_PENDING_MASK) || + if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { + PENDING(&t->signal->shared_pending, &t->blocked) || + cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } @@ -2108,6 +2110,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); + cgroup_enter_frozen(); freezable_schedule(); } else { /* @@ -2286,6 +2289,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ + cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2332,6 +2336,43 @@ static void do_jobctl_trap(void) } } +/** + * do_freezer_trap - handle the freezer jobctl trap + * + * Puts the task into frozen state, if only the task is not about to quit. + * In this case it drops JOBCTL_TRAP_FREEZE. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, + * which is always released before returning. + */ +static void do_freezer_trap(void) + __releases(&current->sighand->siglock) +{ + /* + * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, + * let's make another loop to give it a chance to be handled. + * In any case, we'll return back. + */ + if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != + JOBCTL_TRAP_FREEZE) { + spin_unlock_irq(&current->sighand->siglock); + return; + } + + /* + * Now we're sure that there is no pending fatal signal and no + * pending traps. Clear TIF_SIGPENDING to not get out of schedule() + * immediately (if there is a non-fatal signal pending), and + * put the task into sleep.
+ */ + __set_current_state(TASK_INTERRUPTIBLE); + clear_thread_flag(TIF_SIGPENDING); + spin_unlock_irq(&current->sighand->siglock); + cgroup_enter_frozen(); + freezable_schedule(); +} + static int ptrace_signal(int signr, kernel_siginfo_t *info) { /* @@ -2442,6 +2483,10 @@ relock: ksig->info.si_signo = signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); recalc_sigpending(); + current->jobctl &= ~JOBCTL_TRAP_FREEZE; + spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); goto fatal; } @@ -2452,9 +2497,24 @@ relock: do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { - do_jobctl_trap(); + if (unlikely(current->jobctl & + (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { + if (current->jobctl & JOBCTL_TRAP_MASK) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + } else if (current->jobctl & JOBCTL_TRAP_FREEZE) + do_freezer_trap(); + + goto relock; + } + + /* + * If the task is leaving the frozen state, let's update + * cgroup counters and reset the frozen bit. + */ + if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); + cgroup_leave_frozen(true); goto relock; } @@ -2548,8 +2608,8 @@ relock: continue; } - fatal: spin_unlock_irq(&sighand->siglock); + fatal: /* * Anything else is fatal, maybe with a core dump. -- cgit v1.2.3 From 712e35178754bbb785d00d5fcf5abaf32699bf11 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:07 -0700 Subject: cgroup: make TRACE_CGROUP_PATH irq-safe To use the TRACE_CGROUP_PATH() macro with css_set_lock locked, let's make the macro irq-safe. It's necessary in order to trace cgroup freezer state transitions (frozen/not frozen), which happen with css_set_lock locked. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 02c001ffe2e2..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) -- cgit v1.2.3 From 4c476d8cff48853645abc822154aaad208faebcc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:08 -0700 Subject: cgroup: add tracing points for cgroup v2 freezer Add cgroup:cgroup_freeze and cgroup:cgroup_unfreeze events, which use the existing cgroup tracing infrastructure. Add the cgroup_event event class, which is similar to the cgroup class, but contains an additional integer field to store a new value (the level field is dropped). Also add two tracing events: cgroup_notify_populated and cgroup_notify_frozen, which are raised in a generic way using the TRACE_CGROUP_PATH() macro. This makes it possible to trace cgroup state transitions and is generally helpful for debugging the cgroup freezer code.
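To make the event-class change concrete, here is a hedged sketch of how an event in the new cgroup_event class is declared (modeled on the style of include/trace/events/cgroup.h; the event name cgroup_my_event is hypothetical):

DEFINE_EVENT(cgroup_event, cgroup_my_event,

	TP_PROTO(struct cgroup *cgrp, const char *path, int val),

	TP_ARGS(cgrp, path, val)
);

At runtime the events can then be enabled through the usual tracefs files, e.g. under /sys/kernel/debug/tracing/events/cgroup/ (assuming tracefs is mounted there).
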
Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ kernel/cgroup/freezer.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6895464b54c6..57edcf398d71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -816,6 +816,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 9d8cda478fc9..3bfbb3c8baf3 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -6,6 +6,8 @@ #include "cgroup-internal.h" +#include <trace/events/cgroup.h> + /* * Propagate the cgroup frozen state upwards by the cgroup tree. */ @@ -28,6 +30,7 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) cgrp->nr_descendants) { set_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); desc++; } } else { @@ -35,6 +38,7 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) if (test_bit(CGRP_FROZEN, &cgrp->flags)) { clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); desc++; } } @@ -73,6 +77,7 @@ void cgroup_update_frozen(struct cgroup *cgrp) clear_bit(CGRP_FROZEN, &cgrp->flags); } cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); /* Update the state of ancestor cgroups. */ cgroup_propagate_frozen(cgrp, frozen); @@ -189,6 +194,11 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) clear_bit(CGRP_FREEZE, &cgrp->flags); spin_unlock_irq(&css_set_lock); + if (freeze) + TRACE_CGROUP_PATH(freeze, cgrp); + else + TRACE_CGROUP_PATH(unfreeze, cgrp); + css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* @@ -312,6 +322,9 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) * In both cases it's better to notify a user, that there is * nothing to wait for.
*/ - if (!applied) + if (!applied) { + TRACE_CGROUP_PATH(notify_frozen, cgrp, + test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); + } } -- cgit v1.2.3 From ad2e379def135ebc079f89a0e0b1d987d243f949 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Nov 2018 15:23:50 +0000 Subject: sched/debug: Fix spelling mistake "logaritmic" -> "logarithmic" Signed-off-by: Colin Ian King Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20181128152350.13622-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -702,7 +702,7 @@ do { \ static const char *sched_tunable_scaling_names[] = { "none", - "logaritmic", + "logarithmic", "linear" }; -- cgit v1.2.3 From 52fde6e70cccc2fcf3f39fed0d0392960e2c2b03 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 21 Apr 2019 19:40:44 -0400 Subject: function_graph: Have selftest also emulate tr->reset() as it did with tr->init() The function_graph boot-up selftest emulates the tr->init() function in order to add a wrapper around the function graph tracer entry code to test for lock ups and such. But it does not emulate the tr->reset(), and just calls the function_graph tracer tr->reset() function, which will use its own fgraph_ops to unregister function tracing. As the fgraph_ops is becoming more meaningful with the register_ftrace_graph() and unregister_ftrace_graph() functions, the two need to be the same. The emulated tr->init() uses its own fgraph_ops descriptor, which means the unregister_ftrace_graph() must use the same ftrace_ops, which the selftest currently does not do. By emulating the tr->reset() as the selftest does with the tr->init() it will be able to pass the same fgraph_ops descriptor to the unregister_ftrace_graph() as it did with the register_ftrace_graph(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); + /* Need to also simulate the tr->reset to remove this fgraph_ops */ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(&fgraph_ops); + tracing_start(); if (!ret && !count) { -- cgit v1.2.3 From 70c4cf17e445264453bc5323db3e50aa0ac9e81f Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 19 Apr 2019 20:49:29 -0500 Subject: audit: fix a memory leak bug In audit_rule_change(), audit_data_to_entry() is first invoked to translate the payload data to the kernel's rule representation. In audit_data_to_entry(), depending on the audit field type, an audit tree may be created in audit_make_tree(), which eventually invokes kmalloc() to allocate the tree. Since this tree is a temporary tree, it will then be freed later in the execution, e.g., in audit_add_rule() if the message type is AUDIT_ADD_RULE or in audit_del_rule() if the message type is AUDIT_DEL_RULE.
However, if the message type is neither AUDIT_ADD_RULE nor AUDIT_DEL_RULE, i.e., the default case of the switch statement, this temporary tree is not freed. To fix this issue, only allocate the tree when the type is AUDIT_ADD_RULE or AUDIT_DEL_RULE. Signed-off-by: Wenwen Wang Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2c3c2f349b23..1bc6410413e6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1114,22 +1114,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) int err = 0; struct audit_entry *entry; - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - switch (type) { case AUDIT_ADD_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - err = -EINVAL; WARN_ON(1); + return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { -- cgit v1.2.3 From 7df737e991069d75eec1ded1c8b37e81b8c54df9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:54 -0700 Subject: bpf: remove global variables Move three global variables protected by bpf_verifier_lock into 'struct bpf_verifier_env' to allow parallel verification. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db301e9b5295..5f0eb5bd5589 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5369,10 +5369,6 @@ enum { #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) -static int *insn_stack; /* stack of insns to process */ -static int cur_stack; /* current stack index */ -static int *insn_state; - /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -5380,6 +5376,9 @@ static int *insn_state; */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -5400,9 +5399,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; - if (cur_stack >= env->prog->len) + if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; - insn_stack[cur_stack++] = w; + insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { verbose_linfo(env, t, "%d: ", t); @@ -5426,14 +5425,15 @@ static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; int ret = 0; int i, t; - insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), 
GFP_KERNEL); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } @@ -5441,12 +5441,12 @@ static int check_cfg(struct bpf_verifier_env *env) insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ - cur_stack = 1; + env->cfg.cur_stack = 1; peek_stack: - if (cur_stack == 0) + if (env->cfg.cur_stack == 0) goto check_state; - t = insn_stack[cur_stack - 1]; + t = insn_stack[env->cfg.cur_stack - 1]; if (BPF_CLASS(insns[t].code) == BPF_JMP || BPF_CLASS(insns[t].code) == BPF_JMP32) { @@ -5515,7 +5515,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; - if (cur_stack-- <= 0) { + if (env->cfg.cur_stack-- <= 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; @@ -5535,6 +5535,7 @@ check_state: err_free: kvfree(insn_state); kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } -- cgit v1.2.3 From 45a73c17bfb92c3ceebedc80a750ef2c2931c26b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:55 -0700 Subject: bpf: drop bpf_verifier_lock Drop bpf_verifier_lock for root to avoid being DoS-ed by unprivileged users. The BPF verifier is now fully parallel. All unpriv users are still serialized by bpf_verifier_lock to avoid exhausting kernel memory by running N parallel verifications. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f0eb5bd5589..423f242a5efb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8132,9 +8132,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + is_priv = capable(CAP_SYS_ADMIN); /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); + if (!is_priv) + mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output @@ -8157,7 +8159,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - is_priv = capable(CAP_SYS_ADMIN); env->allow_ptr_leaks = is_priv; ret = replace_map_fd_with_map_ptr(env); @@ -8270,7 +8271,8 @@ err_release_maps: release_maps(env); *prog = env->prog; err_unlock: - mutex_unlock(&bpf_verifier_lock); + if (!is_priv) + mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: kfree(env); -- cgit v1.2.3 From 6beff00b79ca0b5caf0ce6fb8e11f57311bd95f8 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 6 Mar 2019 13:14:12 -0700 Subject: seccomp: fix up grammar in comment This sentence is kind of a train wreck anyway, but at least dropping the extra pronoun helps somewhat. Signed-off-by: Tycho Andersen Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..503d02896c5d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent, * Expects sighand and cred_guard_mutex locks to be held.
* * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) -- cgit v1.2.3 From 148a97d5a02a62f81b5d6176f871c94a65e1f3af Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Apr 2019 17:24:37 +0300 Subject: dma-mapping: remove an unnecessary NULL check We already dereferenced "dev" when we called get_dma_ops() so this NULL check is too late. We're not supposed to pass NULL "dev" pointers to dma_alloc_attrs(). Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 685a53f2a793..f7afdadb6770 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -244,7 +244,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; - WARN_ON_ONCE(dev && !dev->coherent_dma_mask); + WARN_ON_ONCE(!dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; -- cgit v1.2.3 From 877b5691f27a1aec0d9b53095a323e45c30069e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 14 Apr 2019 17:37:09 -0700 Subject: crypto: shash - remove shash_desc::flags The flags field in 'struct shash_desc' never actually does anything. The only ostensibly supported flag is CRYPTO_TFM_REQ_MAY_SLEEP. However, no shash algorithm ever sleeps, making this flag a no-op. With this being the case, inevitably some users who can't sleep wrongly pass MAY_SLEEP. These would all need to be fixed if any shash algorithm actually started sleeping. For example, the shash_ahash_*() functions, which wrap a shash algorithm with the ahash API, pass through MAY_SLEEP from the ahash API to the shash API. However, the shash functions are called under kmap_atomic(), so actually they're assumed to never sleep. Even if it turns out that some users do need preemption points while hashing large buffers, we could easily provide a helper function crypto_shash_update_large() which divides the data into smaller chunks and calls crypto_shash_update() and cond_resched() for each chunk. It's not necessary to have a flag in 'struct shash_desc', nor is it necessary to make individual shash algorithms aware of this at all. Therefore, remove shash_desc::flags, and document that the crypto_shash_*() functions can be called from any context. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- kernel/kexec_file.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..f7fb8f6a688f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image) goto out_free_desc; desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) -- cgit v1.2.3 From 52ba92f5882adf1ee785c4c5ef23491948917fcd Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:41 -0400 Subject: irqdesc: Replace irq_kobj_type's default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace irq_kobj_type's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create irq_groups. 
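For context, ATTRIBUTE_GROUPS(irq) roughly expands to the following (a simplified sketch of the macro from <linux/sysfs.h>, omitting details such as the optional visibility hooks):

static const struct attribute_group irq_group = {
	.attrs = irq_attrs,
};

static const struct attribute_group *irq_groups[] = {
	&irq_group,
	NULL,
};
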
This patch was tested by verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/irqdesc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 13539e12cd80..bbec57bda666 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = { &actions_attr.attr, NULL }; +ATTRIBUTE_GROUPS(irq); static struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = irq_attrs, + .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) -- cgit v1.2.3 From 2064fbc779d43c5ac38e20a4b4979e365f87349f Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:47 -0400 Subject: padata: Replace padata_attr_type default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace padata_attr_type's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create padata_default_groups. This patch was tested by loading the pcrypt module and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Signed-off-by: Greg Kroah-Hartman --- kernel/padata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { ¶llel_cpumask_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { static struct kobj_type padata_attr_type = { .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, + .default_groups = padata_default_groups, .release = padata_sysfs_release, }; -- cgit v1.2.3 From 9782adeb3d9d6e33fc52392031b8e00270515442 Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:53 -0400 Subject: cpufreq: schedutil: Replace default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace sugov_tunables_ktype's default_attrs field with default groups. Change "sugov_attributes" to "sugov_attrs" and use the ATTRIBUTE_GROUPS macro to create sugov_groups. This patch was tested by setting the scaling governor to schedutil and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Acked-by: Peter Zijlstra (Intel) Acked-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c41ea367422..148b60c8993d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -598,13 +598,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; -- cgit v1.2.3 From 70283454c918f1d65de0ec50c45ef592d781bcae Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:58 -0400 Subject: livepatch: Replace klp_ktype_patch's default_attrs with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace klp_ktype_patch's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create klp_patch_groups. This patch was tested by loading the livepatch-sample module and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Acked-by: Jiri Kosina Acked-by: Miroslav Benes Acked-by: Petr Mladek Signed-off-by: Greg Kroah-Hartman --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..34a8338657d2 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = { &force_kobj_attr.attr, NULL }; +ATTRIBUTE_GROUPS(klp_patch); static void klp_free_object_dynamic(struct klp_object *obj) { @@ -546,7 +547,7 @@ static void klp_kobj_release_patch(struct kobject *kobj) static struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = klp_patch_attrs, + .default_groups = klp_patch_groups, }; static void klp_kobj_release_object(struct kobject *kobj) -- cgit v1.2.3 From 118c8e9ae629d35fa9b3d27a7b9d59298b1b4183 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Apr 2019 14:37:23 -0700 Subject: bpf: support BPF_PROG_QUERY for BPF_FLOW_DISSECTOR attach_type target_fd is target namespace. If there is a flow dissector BPF program attached to that namespace, its (single) id is returned. 
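A hedged sketch of the corresponding userspace query is below; the query_flow_dissector() helper and the raw syscall(2) wrapper are assumptions for illustration, while the BPF_PROG_QUERY attribute layout is the existing uapi one.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper: return the id of the flow dissector program
 * attached in the netns referred to by netns_fd, 0 if none, -1 on error.
 */
static int query_flow_dissector(int netns_fd)
{
	union bpf_attr attr;
	__u32 prog_id = 0;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd = netns_fd;	/* the target namespace */
	attr.query.attach_type = BPF_FLOW_DISSECTOR;
	attr.query.prog_ids = (__u64)(unsigned long)&prog_id;
	attr.query.prog_cnt = 1;		/* room for the single id */

	if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) < 0)
		return -1;

	return attr.query.prog_cnt ? (int)prog_id : 0;
}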
v5: * drop net ref right after rcu unlock (Daniel Borkmann) v4: * add missing put_net (Jann Horn) v3: * add missing inline to skb_flow_dissector_prog_query static def (kbuild test robot ) v2: * don't sleep in rcu critical section (Jakub Kicinski) * check input prog_cnt (exit early) Cc: Jann Horn Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 92c9b8a32b50..b0de49598341 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,6 +2009,8 @@ static int bpf_prog_query(const union bpf_attr *attr, break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } -- cgit v1.2.3 From 6b4f4bc9cb22875f97023984a625386f0c7cc1c0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2019 11:58:08 +0000 Subject: locking/futex: Allow low-level atomic operations to return -EAGAIN Some futex() operations, including FUTEX_WAKE_OP, require the kernel to perform an atomic read-modify-write of the futex word via the userspace mapping. These operations are implemented by each architecture in arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which are called in atomic context with the relevant hash bucket locks held. Although these routines may return -EFAULT in response to a page fault generated when accessing userspace, they are expected to succeed (i.e. return 0) in all other cases. This poses a problem for architectures that do not provide bounded forward progress guarantees or fairness of contended atomic operations and can lead to starvation in some cases. In these problematic scenarios, we must return back to the core futex code so that we can drop the hash bucket locks and reschedule if necessary, much like we do in the case of a page fault. Allow architectures to return -EAGAIN from their implementations of arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which will cause the core futex code to reschedule if necessary and return back to the architecture code later on. Cc: Acked-by: Peter Zijlstra (Intel) Signed-off-by: Will Deacon --- kernel/futex.c | 188 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 117 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9e40cf7be606..6262f1534ac9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { + int err; u32 uninitialized_var(curval); if (unlikely(should_fail_futex(true))) return -EFAULT; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; /* If user space value changed, let the caller retry */ return curval != uval ? 
-EAGAIN : 0; @@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { - ret = -EFAULT; - - } else if (curval != uval) { + ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (!ret && (curval != uval)) { /* * If a unconditional UNLOCK_PI operation (user space did not * try the TID->0 transition) raced with a waiter setting the @@ -1700,32 +1700,32 @@ retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ ret = op_ret; goto out_put_keys; } - ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + goto out_put_keys; + } - if (!(flags & FLAGS_SHARED)) + if (!(flags & FLAGS_SHARED)) { + cond_resched(); goto retry_private; + } put_futex_key(&key2); put_futex_key(&key1); + cond_resched(); goto retry; } @@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 uval, uninitialized_var(curval), newval; struct task_struct *oldowner, *newowner; u32 newtid; - int ret; + int ret, err = 0; lockdep_assert_held(q->lock_ptr); @@ -2421,14 +2421,17 @@ retry: if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; + err = get_futex_value_locked(&uval, uaddr); + if (err) + goto handle_err; for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + if (curval == uval) break; uval = curval; @@ -2456,23 +2459,37 @@ retry: return 0; /* - * To handle the page fault we need to drop the locks here. That gives - * the other task (either the highest priority waiter itself or the - * task which stole the rtmutex) the chance to try the fixup of the - * pi_state. So once we are back from handling the fault we need to - * check the pi_state after reacquiring the locks and before trying to - * do another fixup. When the fixup has been done already we simply - * return. + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. When + * the fixup has been done already we simply return. * * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely * drop hb->lock since the caller owns the hb -> futex_q relation. * Dropping the pi_mutex->wait_lock requires the state revalidate. 
*/ -handle_fault: +handle_err: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); - ret = fault_in_user_writeable(uaddr); + switch (err) { + case -EFAULT: + ret = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + ret = 0; + break; + + default: + WARN_ON_ONCE(1); + ret = err; + break; + } spin_lock(q->lock_ptr); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -3041,10 +3058,8 @@ retry: * A unconditional UNLOCK_PI op raced against a waiter * setting the FUTEX_WAITERS bit. Try again. */ - if (ret == -EAGAIN) { - put_futex_key(&key); - goto retry; - } + if (ret == -EAGAIN) + goto pi_retry; /* * wake_futex_pi has detected invalid state. Tell user * space. @@ -3059,9 +3074,19 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { spin_unlock(&hb->lock); - goto pi_faulted; + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + goto out_putkey; + } } /* @@ -3075,6 +3100,11 @@ out_putkey: put_futex_key(&key); return ret; +pi_retry: + put_futex_key(&key); + cond_resched(); + goto retry; + pi_faulted: put_futex_key(&key); @@ -3435,6 +3465,7 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; + int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) @@ -3444,42 +3475,57 @@ retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. 
+ */ + if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; - } - if (nval != uval) + + case -EAGAIN: + cond_resched(); goto retry; - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + default: + WARN_ON_ONCE(1); + return err; + } } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; } -- cgit v1.2.3 From e43e9c339a78a0978f4ce473f645cedc05e6a57c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 13:51:03 -0400 Subject: fsnotify: switch send_to_group() and ->handle_event to const struct qstr * note that conditions surrounding accesses to dname in audit_watch_handle_event() and audit_mark_handle_event() guarantee that dname won't have been NULL. Signed-off-by: Al Viro --- kernel/audit_fsnotify.c | 4 ++-- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 37ae95cfb7f4..fb241805569c 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -164,7 +164,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); @@ -188,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { - if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + if (audit_compare_dname_path(dname->name, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index abfb112f26aa..e49c912f862d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1040,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie, + const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e8d1adeb2223..3c12fd5b680e 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -482,7 +482,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); @@ -507,9 +507,9 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + 
audit_update_watch(parent, dname->name, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + audit_update_watch(parent, dname->name, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); -- cgit v1.2.3 From 6921d4ebe418e7cce9f65c1f38c93ea82a1f546c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 14:09:49 -0400 Subject: audit_update_watch(): switch to const struct qstr * Signed-off-by: Al Viro --- kernel/audit_watch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3c12fd5b680e..d832ce9df065 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -255,18 +255,19 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, + const struct qstr *dname, dev_t dev, unsigned long ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; struct audit_entry *oentry, *nentry; + const unsigned char *name = dname->name; mutex_lock(&audit_filter_mutex); /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, + if (audit_compare_dname_path(name, owatch->path, AUDIT_NAME_FULL)) continue; @@ -507,9 +508,9 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname->name, inode->i_sb->s_dev, inode->i_ino, 0); + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname->name, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); -- cgit v1.2.3 From 9df1c28bb75217b244257152ab7d788bb2a386d0 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:47 -0700 Subject: bpf: add writable context for raw tracepoints This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature. The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter. 
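By way of illustration, here is a hedged sketch of a program for such a tracepoint. The tracepoint name, the buffer layout and the "raw_tracepoint.w/" section prefix (a later libbpf convention) are assumptions for the example, not part of this patch.

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* selftests-style SEC() helper */

/* Hypothetical layout of the writable buffer the tracepoint exposes. */
struct example_writable {
	int handled;
};

SEC("raw_tracepoint.w/example_event")
int mark_handled(struct bpf_raw_tracepoint_args *ctx)
{
	/* args[0] is the writable buffer pointer for opted-in tracepoints */
	struct example_writable *buf =
		(struct example_writable *)(long)ctx->args[0];

	/* the store is only permitted for the _WRITABLE program type */
	buf->handled = 1;
	return 0;
}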
Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 24 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b0de49598341..ae141e745f92 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1789,12 +1789,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 423f242a5efb..2ef442c62c0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -405,6 +405,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -1993,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2137,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91800be0c8eb..8607aba1d882 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -915,6 +915,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = 
raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1204,6 +1225,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } -- cgit v1.2.3 From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). When bpf prog needs to store data for a sk, the current practice is to define a map with the usual 4-tuples (src/dst ip/port) as the key. If multiple bpf progs require to store different sk data, multiple maps have to be defined. Hence, wasting memory to store the duplicated keys (i.e. 4 tuples here) in each of the bpf map. [ The smallest key could be the sk pointer itself which requires some enhancement in the verifier and it is a separate topic. ] Also, the bpf prog needs to clean up the elem when sk is freed. Otherwise, the bpf map will become full and un-usable quickly. The sk-free tracking currently could be done during sk state transition (e.g. BPF_SOCK_OPS_STATE_CB). The size of the map needs to be predefined which then usually ended-up with an over-provisioned map in production. Even the map was re-sizable, while the sk naturally come and go away already, this potential re-size operation is arguably redundant if the data can be directly connected to the sk itself instead of proxy-ing through a bpf map. This patch introduces sk->sk_bpf_storage to provide local storage space at sk for bpf prog to use. The space will be allocated when the first bpf prog has created data for this particular sk. The design optimizes the bpf prog's lookup (and then optionally followed by an inline update). bpf_spin_lock should be used if the inline update needs to be protected. BPF_MAP_TYPE_SK_STORAGE: ----------------------- To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can be created to fit different bpf progs' needs. The map enforces BTF to allow printing the sk-local-storage during a system-wise sk dump (e.g. "ss -ta") in the future. The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete a "sk-local-storage" data from a particular sk. Think of the map as a meta-data (or "type") of a "sk-local-storage". This particular "type" of "sk-local-storage" data can then be stored in any sk. The main purposes of this map are mostly: 1. Define the size of a "sk-local-storage" type. 2. Provide a similar syscall userspace API as the map (e.g. lookup/update, map-id, map-btf...etc.) 3. 
Keep track of all sk's storages of this "type" and clean them up when the map is freed. sk->sk_bpf_storage: ------------------ The main lookup/update/delete is done on sk->sk_bpf_storage (which is a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is now used as the "key" to search on the sk_storage->list. The "map" pointer is actually serving as the "type" of the "sk-local-storage" that is being requested. To allow very fast lookup, it should be as fast as looking up an array at a stable-offset. At the same time, it is not ideal to set a hard limit on the number of sk-local-storage "type" that the system can have. Hence, this patch takes a cache approach. The last search result from sk_storage->list is cached in sk_storage->cache[] which is a stable sized array. Each "sk-local-storage" type has a stable offset to the cache[] array. In the future, a map's flag could be introduced to do cache opt-out/enforcement if it became necessary. The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs can share map. On the program side, having a few bpf_progs running in the networking hotpath is already a lot. The bpf_prog should have already consolidated the existing sock-key-ed map usage to minimize the map lookup penalty. 16 has enough runway to grow. All sk-local-storage data will be removed from sk->sk_bpf_storage during sk destruction. bpf_sk_storage_get() and bpf_sk_storage_delete(): ------------------------------------------------ Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog needs to use the new helper bpf_sk_storage_get() and bpf_sk_storage_delete(). The verifier can then enforce the ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to "create" new elem if one does not exist in the sk. It is done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together, it has eliminated the potential use cases for an equivalent bpf_map_update_elem() API (for bpf_prog) in this patch. Misc notes: ---------- 1. map_get_next_key is not supported. From the userspace syscall perspective, the map has the socket fd as the key while the map can be shared by pinned-file or map-id. Since btf is enforced, the existing "ss" could be enhanced to pretty print the local-storage. Supporting a kernel defined btf with 4 tuples as the return key could be explored later also. 2. The sk->sk_lock cannot be acquired. Atomic operations is used instead. e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please refer to the source code comments for the details in synchronization cases and considerations. 3. The mem is charged to the sk->sk_omem_alloc as the sk filter does. Benchmark: --------- Here is the benchmark data collected by turning on the "kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested: One bpf prog with the usual bpf hashmap (max_entries = 8192) with the sk ptr as the key. (verifier is modified to support sk ptr as the key That should have shortened the key lookup time.) Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE. Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for each egress skb and then bump the cnt. netperf is used to drive data with 4096 connected UDP sockets. 
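The per-socket counting pattern being measured looks roughly like the hedged sketch below; it is a reconstruction, not the exact benchmark program. Map and field names are illustrative, and the BTF-style map definition follows later libbpf conventions.

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct sk_cnt {
	__u32 egress_pkts;
};

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required for sk storage */
	__type(key, int);			/* socket fd on the syscall side */
	__type(value, struct sk_cnt);
} sk_cnt_map SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct sk_cnt *cnt;

	if (!sk)
		return 1;
	sk = bpf_sk_fullsock(sk);	/* the helper wants a fullsock */
	if (!sk)
		return 1;

	/* lookup, creating the per-sk element on first use */
	cnt = bpf_sk_storage_get(&sk_cnt_map, sk, NULL,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (cnt)
		cnt->egress_pkts++;

	return 1;	/* 1 == allow the packet */
}

The measurements below compare the plain hashmap variant of this pattern against the sk-storage one.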
BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae141e745f92..ad3ccf82f31d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -526,7 +526,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ef442c62c0e..271717246af3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2543,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2578,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2635,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == 
ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2784,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2847,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } -- cgit v1.2.3 From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:06 +0200 Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most netlink based interfaces (including recently added ones) are still not setting it in kernel generated messages. Without the flag, message parsers not aware of attribute semantics (e.g. wireshark dissector or libmnl's mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display the structure of their contents. Unfortunately we cannot just add the flag everywhere as there may be userspace applications which check nlattr::nla_type directly rather than through a helper masking out the flags. Therefore the patch renames nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start() as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually are rewritten to use nla_nest_start(). Except for changes in include/net/netlink.h, the patch was generated using this semantic patch: @@ expression E1, E2; @@ -nla_nest_start(E1, E2) +nla_nest_start_noflag(E1, E2) @@ expression E1, E2; @@ -nla_nest_start_noflag(E1, E2 | NLA_F_NESTED) +nla_nest_start(E1, E2) Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 1b942a7caf26..ef4f9cd980fd 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -375,7 +375,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - na = nla_nest_start(skb, aggr); + na = nla_nest_start_noflag(skb, aggr); if (!na) goto err; -- cgit v1.2.3 From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:28 +0200 Subject: netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. 
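Sketched in C, the four options combine roughly as follows. The names are modelled on the NL_VALIDATE_* flags in include/net/netlink.h; treat the exact values as illustrative.

/* hedged sketch of the strictness flags, not a verbatim copy */
enum netlink_validation {
	NL_VALIDATE_LIBERAL	 = 0,	/* the old default */
	NL_VALIDATE_TRAILING	 = 1,	/* no trailing data after attrs */
	NL_VALIDATE_MAXTYPE	 = 2,	/* reject attrs > max known type */
	NL_VALIDATE_UNSPEC	 = 4,	/* reject NLA_UNSPEC attrs */
	NL_VALIDATE_STRICT_ATTRS = 8,	/* strict attribute size checks */
};

/* what the old opt-in "strict" parsing corresponds to */
#define NL_VALIDATE_DEPRECATED_STRICT \
	(NL_VALIDATE_TRAILING | NL_VALIDATE_MAXTYPE)

/* everything, the intended default for new interfaces */
#define NL_VALIDATE_STRICT \
	(NL_VALIDATE_TRAILING | NL_VALIDATE_MAXTYPE | \
	 NL_VALIDATE_UNSPEC | NL_VALIDATE_STRICT_ATTRS)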
The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ef4f9cd980fd..0e347f1c7800 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -677,8 +677,9 @@ static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return -EINVAL; } - return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, - policy, info->extack); + return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, + TASKSTATS_CMD_ATTR_MAX, policy, + info->extack); } static struct genl_family family __ro_after_init = { -- cgit v1.2.3 From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:31 +0200 Subject: genetlink: optionally validate strictly/dumps Add options to strictly validate messages and dump messages, sometimes perhaps validating dump messages non-strictly may be required, so add an option for that as well. 
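For a concrete picture, a hedged sketch of the resulting genl_ops pattern follows; the EXAMPLE_* identifiers are hypothetical, while the GENL_DONT_VALIDATE_* flags are the ones the spatch below inserts.

static const struct genl_ops example_ops[] = {
	{
		/* pre-existing command: opt out to keep old behaviour */
		.cmd	  = EXAMPLE_CMD_OLD,
		.validate = GENL_DONT_VALIDATE_STRICT |
			    GENL_DONT_VALIDATE_DUMP,
		.doit	  = example_old_doit,
	},
	{
		/* new command: no opt-out flags, so fully strict */
		.cmd  = EXAMPLE_CMD_NEW,
		.doit = example_new_doit,
	},
};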
Since none of this can really be applied to existing commands, set the options everwhere using the following spatch: @@ identifier ops; expression X; @@ struct genl_ops ops[] = { ..., { .cmd = X, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ... }, ... }; For new commands one should just not copy the .validate 'opt-out' flags and thus get strict validation. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0e347f1c7800..5f852b8f59f7 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -649,12 +649,14 @@ err: static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, /* policy enforced later */ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, /* policy enforced later */ .flags = GENL_CMD_CAP_HASPOL, -- cgit v1.2.3 From 795d673af1afae8146ac3070a2d77cfae5287c43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 14:11:05 -0400 Subject: audit_compare_dname_path(): switch to const struct qstr * Signed-off-by: Al Viro --- kernel/audit.h | 2 +- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 3 +-- kernel/auditfilter.c | 6 +++--- kernel/auditsc.c | 4 ++-- 5 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 958d5b8fc1b3..2071725a999f 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -231,7 +231,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, int plen); +extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen); extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index fb241805569c..b5737b826951 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -188,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { - if (audit_compare_dname_path(dname->name, audit_mark->path, AUDIT_NAME_FULL)) + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d832ce9df065..b50c574223fa 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -261,13 +261,12 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; struct audit_entry *oentry, *nentry; - const unsigned char *name = dname->name; mutex_lock(&audit_filter_mutex); /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(name, owatch->path, + if 
(audit_compare_dname_path(dname, owatch->path, AUDIT_NAME_FULL)) continue; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..f9fff93c3351 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,12 +1290,12 @@ int parent_len(const char *path) * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. */ -int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; - dlen = strlen(dname); + dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; @@ -1306,7 +1306,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) p = path + parentlen; - return strncmp(p, dname, dlen); + return strncmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..92d0ae63febd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2045,7 +2045,7 @@ void __audit_inode_child(struct inode *parent, { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); - const char *dname = dentry->d_name.name; + const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; @@ -2099,7 +2099,7 @@ void __audit_inode_child(struct inode *parent, (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (!strcmp(dname, n->name->name) || + if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : -- cgit v1.2.3 From 9b019acb72e4b5741d88e8936d6f200ed44b66b2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 Apr 2019 14:26:13 +1000 Subject: sched/nohz: Run NOHZ idle load balancer on HK_FLAG_MISC CPUs The NOHZ idle balancer runs on the lowest idle CPU. This can interfere with isolated CPUs, so confine it to HK_FLAG_MISC housekeeping CPUs. HK_FLAG_SCHED is not used for this because it is not set anywhere at the moment. This could be folded into HK_FLAG_SCHED once that option is fixed. The problem was observed with increased jitter on an application running on CPU0, caused by NOHZ idle load balancing being run on CPU1 (an SMT sibling). Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190412042613.28930-1-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13bafe350abf..7b0da7007da3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9519,22 +9519,26 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * anywhere yet. 
*/ static inline int find_new_ilb(void) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int ilb; - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + housekeeping_cpumask(HK_FLAG_MISC)) { + if (idle_cpu(ilb)) + return ilb; + } return nr_cpu_ids; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { -- cgit v1.2.3 From 948f83768a180ec8e85c4a8ff269d5e433d10815 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Apr 2019 18:02:44 +0200 Subject: locking/lockdep: Test all incompatible scenarios at once in check_irq_usage() check_prev_add_irq() tests all incompatible scenarios one after the other while adding a lock (@next) to a tree dependency (@prev): LOCK_USED_IN_HARDIRQ vs LOCK_ENABLED_HARDIRQ LOCK_USED_IN_HARDIRQ_READ vs LOCK_ENABLED_HARDIRQ LOCK_USED_IN_SOFTIRQ vs LOCK_ENABLED_SOFTIRQ LOCK_USED_IN_SOFTIRQ_READ vs LOCK_ENABLED_SOFTIRQ Also for these four scenarios, we must at least iterate the @prev backward dependency. Then if it matches the relevant LOCK_USED_* bit, we must also iterate the @next forward dependency. Therefore in the best case we iterate 4 times, in the worst case 8 times. A different approach can let us divide the number of branch iterations by 4: 1) Iterate through @prev backward dependencies and accumulate all the IRQ uses in a single mask. In the best case where the current lock hasn't been used in IRQ, we stop here. 2) Iterate through @next forward dependencies and try to find a lock whose usage is exclusive to the accumulated usages gathered in the previous step. If we find one (call it @lockA), we have found an incompatible use; otherwise we stop here. Only a bad locking scenario goes further, so a sane verification stops here. 3) Iterate again through the @prev backward dependency and find the lock whose usage matches @lockA in terms of incompatibility. Call that lock @lockB. 4) Report the incompatible usages of @lockA and @lockB. If no incompatible use is found, the verification never goes beyond step 2, which means at most two iterations. The following compares the execution measurements of the function check_prev_add_irq():

             Number of calls | Avg (ns) | Stdev (ns) | Total time (ns)
  ------------------------------------------------------------------------
  Mainline                8452 |     2652 |      11962 |        22415143
  This patch              8452 |     1518 |       7090 |        12835602

Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190402160244.32434-5-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 228 ++++++++++++++++++++++++++----------- kernel/locking/lockdep_internals.h | 6 + 2 files changed, 167 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5e149dd78298..25ecc6d3058b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1676,6 +1676,14 @@ check_redundant(struct lock_list *root, struct lock_class *target, } #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ + *(unsigned long *)mask |= entry->class->usage_mask; + + return 0; +} + /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1687,8 +1695,6 @@ static inline int usage_match(struct lock_list *entry, void *mask) return entry->class->usage_mask & *(unsigned long *)mask; } - - /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. @@ -1922,39 +1928,6 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; } -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, lock_flag(bit_backwards), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, lock_flag(bit_forwards), &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); -} - static const char *state_names[] = { #define LOCKDEP_STATE(__STATE) \ __stringify(__STATE), @@ -1977,6 +1950,13 @@ static inline const char *state_name(enum lock_usage_bit bit) return state_names[bit >> LOCK_USAGE_DIR_MASK]; } +/* + * The bit number is encoded like: + * + * bit0: 0 exclusive, 1 read lock + * bit1: 0 used in irq, 1 irq enabled + * bit2-n: state + */ static int exclusive_bit(int new_bit) { int state = new_bit & LOCK_USAGE_STATE_MASK; @@ -1988,45 +1968,160 @@ static int exclusive_bit(int new_bit) return state | (dir ^ LOCK_USAGE_DIR_MASK); } +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. 
+ */ +static unsigned long invert_dir_mask(unsigned long mask) +{ + unsigned long excl = 0; + + /* Invert dir */ + excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; + excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + + return excl; +} + +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. + */ +static unsigned long exclusive_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Strip read */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl &= ~LOCKF_IRQ_READ; + + return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, + unsigned long excl_mask, + enum lock_usage_bit *bitp, + enum lock_usage_bit *excl_bitp) +{ + int bit, excl; + + for_each_set_bit(bit, &mask, LOCK_USED) { + excl = exclusive_bit(bit); + if (excl_mask & lock_flag(excl)) { + *bitp = bit; + *excl_bitp = excl; + return 0; + } + } + return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) + struct held_lock *next) { + unsigned long usage_mask = 0, forward_mask, backward_mask; + enum lock_usage_bit forward_bit = 0, backward_bit = 0; + struct lock_list *uninitialized_var(target_entry1); + struct lock_list *uninitialized_var(target_entry); + struct lock_list this, that; + int ret; + /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : + * Step 1: gather all hard/soft IRQs usages backward in an + * accumulated usage mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + this.parent = NULL; + this.class = hlock_class(prev); - bit++; /* _READ */ + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + if (ret < 0) + return print_bfs_bug(ret); + + usage_mask &= LOCKF_USED_IN_IRQ_ALL; + if (!usage_mask) + return 1; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : + * Step 2: find exclusive uses forward that match the previous + * backward accumulated mask. 
*/ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + forward_mask = exclusive_mask(usage_mask); - return 1; -} + that.parent = NULL; + that.class = hlock_class(next); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE + ret = find_usage_forwards(&that, forward_mask, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; - return 1; + /* + * Step 3: we found a bad match! Now retrieve a lock from the backward + * list whose usage mask matches the exclusive usage mask from the + * lock found on the forward list. + */ + backward_mask = original_mask(target_entry1->class->usage_mask); + + ret = find_usage_backwards(&this, backward_mask, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (DEBUG_LOCKS_WARN_ON(ret == 1)) + return 1; + + /* + * Step 4: narrow down to a pair of incompatible usage bits + * and report it. + */ + ret = find_exclusive_match(target_entry->class->usage_mask, + target_entry1->class->usage_mask, + &backward_bit, &forward_bit); + if (DEBUG_LOCKS_WARN_ON(ret == -1)) + return 1; + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); } static void inc_chains(void) @@ -2043,9 +2138,8 @@ static void inc_chains(void) #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, + struct held_lock *prev, struct held_lock *next) { return 1; } @@ -2225,7 +2319,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, else if (unlikely(ret < 0)) return print_bfs_bug(ret); - if (!check_prev_add_irq(curr, prev, next)) + if (!check_irq_usage(curr, prev, next)) return 0; /* diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 2b3ffd4117ad..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -66,6 +66,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = 0; #undef LOCKDEP_STATE +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) + +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) + /* * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, * .data and .bss to fit in required 32MB limit for the kernel. With -- cgit v1.2.3 From ad282a8117d5048398f506f20b092c14b3b3c43f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:52 -0700 Subject: locking/static_key: Add support for deferred static branches Add deferred static branches. We can't unfortunately use the nice trick of encapsulating the entire structure in true/false variants, because the inside has to be either struct static_key_true or struct static_key_false. Use defines to pass the appropriate members to the helpers separately. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-2-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a799b1ac6b2f..73bbbaddbd9c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -244,12 +244,13 @@ static void __static_key_slow_dec(struct static_key *key, cpus_read_unlock(); } -static void jump_label_update_timeout(struct work_struct *work) +void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key, 0, NULL); } +EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { @@ -264,19 +265,21 @@ void static_key_slow_dec_cpuslocked(struct static_key *key) __static_key_slow_dec_cpuslocked(key, 0, NULL); } -void static_key_slow_dec_deferred(struct static_key_deferred *key) +void __static_key_slow_dec_deferred(struct static_key *key, + struct delayed_work *work, + unsigned long timeout) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(key, timeout, work); } -EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -void static_key_deferred_flush(struct static_key_deferred *key) +void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); - flush_delayed_work(&key->work); + flush_delayed_work(work); } -EXPORT_SYMBOL_GPL(static_key_deferred_flush); +EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) -- cgit v1.2.3 From b92e793bbe4a1c49dbf78d8d526561e7a7dd568a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:53 -0700 Subject: locking/static_key: Factor out the fast path of static_key_slow_dec() static_key_slow_dec() checks whether the atomic enable count is larger than 1 and, if so, decrements it before taking the jump_label_lock. Move this logic into a helper for reuse in rate limited keys. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-3-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 73bbbaddbd9c..02c3d11264dd 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -202,13 +202,13 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec_cpuslocked(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static bool static_key_slow_try_dec(struct static_key *key) { int val; - lockdep_assert_cpus_held(); + val = atomic_fetch_add_unless(&key->enabled, -1, 1); + if (val == 1) + return false; /* * The negative count check is valid even when a negative @@ -217,11 +217,18 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. */ - val = atomic_fetch_add_unless(&key->enabled, -1, 1); - if (val != 1) { - WARN(val < 0, "jump label: negative count!\n"); + WARN(val < 0, "jump label: negative count!\n"); + return true; +} + +static void __static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ + lockdep_assert_cpus_held(); + + if (static_key_slow_try_dec(key)) return; - } jump_label_lock(); if (atomic_dec_and_test(&key->enabled)) { -- cgit v1.2.3 From 94b5f312cfb4a66055d9b688dc9ab6b297eb9dcc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:54 -0700 Subject: locking/static_key: Don't take sleeping locks in __static_key_slow_dec_deferred() Changing jump_label state is protected by jump_label_lock(). Rate limited static_key_slow_dec(), however, will never directly call jump_label_update(), it will schedule a delayed work instead. Therefore it's unnecessary to take both the cpus_read_lock() and jump_label_lock(). This allows static_key_slow_dec_deferred() to be called from atomic contexts, like socket destructing in net/tls, without the need for another indirection. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-4-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 02c3d11264dd..de6efdecc70d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -221,9 +221,7 @@ static bool static_key_slow_try_dec(struct static_key *key) return true; } -static void __static_key_slow_dec_cpuslocked(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); @@ -231,23 +229,15 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, return; jump_label_lock(); - if (atomic_dec_and_test(&key->enabled)) { - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else { - jump_label_update(key); - } - } + if (atomic_dec_and_test(&key->enabled)) + jump_label_update(key); jump_label_unlock(); } -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); - __static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } @@ -255,21 +245,21 @@ void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); - __static_key_slow_dec(&key->key, 0, NULL); + __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(key, 0, NULL); + __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec_cpuslocked(key, 0, NULL); + __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, @@ -277,7 +267,11 @@ void __static_key_slow_dec_deferred(struct static_key *key, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(key, timeout, work); + + if (static_key_slow_try_dec(key)) + return; + + schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -- cgit v1.2.3 From 7a5da02de8d6eafba99556f8c98e5313edebb449 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 18 Apr 2019 16:24:50 +0200 Subject: locking/lockdep: check for freed initmem in static_obj() The following warning occurred on s390: WARNING: CPU: 0 PID: 804 at kernel/locking/lockdep.c:1025 lockdep_register_key+0x30/0x150 This is because the check in static_obj() assumes that all memory within [_stext, _end] belongs to static objects, which at least for s390 isn't true. The init section is also part of this range, and freeing it allows the buddy allocator to allocate memory from it. We have virt == phys for the kernel on s390, so that such allocations would then have addresses within the range [_stext, _end]. To fix this, introduce arch_is_kernel_initmem_freed(), similar to arch_is_kernel_text/data(), and add it to the checks in static_obj(). 
This will always return 0 on architectures that do not define arch_is_kernel_initmem_freed. On s390, it will return 1 if initmem has been freed and the address is in the range [__init_begin, __init_end]. Signed-off-by: Gerald Schaefer Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- kernel/locking/lockdep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..22a99530983e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -649,6 +649,9 @@ static int static_obj(const void *obj) end = (unsigned long) &_end, addr = (unsigned long) obj; + if (arch_is_kernel_initmem_freed(addr)) + return 0; + /* * static variable? */ -- cgit v1.2.3 From 3d9a8072915366b5932beeed97f158f8d4955768 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:54 +0200 Subject: tracing: Cleanup stack trace code - Remove the extra array member of stack_dump_trace[] along with the ARRAY_SIZE - 1 initialization for struct stack_trace :: max_entries. Both are historical leftovers of no value. The stack tracer never exceeds the array and there is no extra storage requirement either. - Make variables which are only used in trace_stack.c static. - Simplify the enable/disable logic. - Rename stack_trace_print() as it's using the stack_trace_ namespace. Free the name up for stack trace related functions. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.230654524@linutronix.de --- kernel/trace/trace_stack.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c6e54ff25cae..4efda5f75a0f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,30 +18,26 @@ #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES + 1]; -unsigned stack_trace_index[STACK_TRACE_ENTRIES]; +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -/* - * Reserve one entry for the passed in ip. This will allow - * us to remove most or all of the stack size overhead - * added by the stack tracer itself. 
- */ struct stack_trace stack_trace_max = { - .max_entries = STACK_TRACE_ENTRIES - 1, + .max_entries = STACK_TRACE_ENTRIES, .entries = &stack_dump_trace[0], }; -unsigned long stack_trace_max_size; -arch_spinlock_t stack_trace_max_lock = +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; -static int last_stack_tracer_enabled; -void stack_trace_print(void) +static void print_max_stack(void) { long i; int size; @@ -61,16 +57,7 @@ void stack_trace_print(void) } } -/* - * When arch-specific code overrides this function, the following - * data should be filled up, assuming stack_trace_max_lock is held to - * prevent concurrent updates. - * stack_trace_index[] - * stack_trace_max - * stack_trace_max_size - */ -void __weak -check_stack(unsigned long ip, unsigned long *stack) +static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; @@ -179,7 +166,7 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max.nr_entries = x; if (task_stack_end_corrupted(current)) { - stack_trace_print(); + print_max_stack(); BUG(); } @@ -412,23 +399,21 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int was_enabled; int ret; mutex_lock(&stack_sysctl_mutex); + was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || - (last_stack_tracer_enabled == !!stack_tracer_enabled)) + if (ret || !write || (was_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = !!stack_tracer_enabled; - if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: mutex_unlock(&stack_sysctl_mutex); return ret; @@ -444,7 +429,6 @@ static __init int enable_stacktrace(char *str) strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; - last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); -- cgit v1.2.3 From e9b98e162aa53cbea7c8b0d6c9d5dc6e0f822b9c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:55 +0200 Subject: stacktrace: Provide helpers for common stack trace operations All operations with stack traces are based on struct stack_trace. That's a horrible construct as the struct is a kitchen sink for input and output. Quite some usage sites embed it into their own data structures which creates weird indirections. There is absolutely no point in doing so. For all use cases a storage array and the number of valid stack trace entries in the array is sufficient. Provide helper functions which avoid the struct stack_trace indirection so the usage sites can be cleaned up. 
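To make the new interface concrete, here is a minimal usage sketch; the wrapper function and the array size of 16 are illustrative assumptions, while stack_trace_save() and stack_trace_print() are the helpers this patch introduces:

/*
 * Illustrative sketch only: dump the current stack with the new
 * storage-array based helpers. The function name and the entry
 * count are assumptions, not part of the patch.
 */
static void example_dump_current_stack(void)
{
	unsigned long entries[16];
	unsigned int nr_entries;

	/* Save up to 16 entries, skipping no frames at the top. */
	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

	/* Print them with no extra indentation. */
	stack_trace_print(entries, nr_entries, 0);
}

Compare this with the struct stack_trace kitchen sink, where a caller had to populate entries, max_entries and skip before calling save_stack_trace() and then read nr_entries back out of the struct.
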
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.324810708@linutronix.de --- kernel/stacktrace.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..b38333b3bc18 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -11,35 +11,54 @@ #include #include -void print_stack_trace(struct stack_trace *trace, int spaces) +/** + * stack_trace_print - Print the entries in the stack trace + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + */ +void stack_trace_print(unsigned long *entries, unsigned int nr_entries, + int spaces) { - int i; + unsigned int i; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return; - for (i = 0; i < trace->nr_entries; i++) - printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); + for (i = 0; i < nr_entries; i++) + printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]); +} +EXPORT_SYMBOL_GPL(stack_trace_print); + +void print_stack_trace(struct stack_trace *trace, int spaces) +{ + stack_trace_print(trace->entries, trace->nr_entries, spaces); } EXPORT_SYMBOL_GPL(print_stack_trace); -int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces) +/** + * stack_trace_snprint - Print the entries in the stack trace into a buffer + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. 
+ */ +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, + unsigned int nr_entries, int spaces) { - int i; - int generated; - int total = 0; + unsigned int generated, i, total = 0; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return 0; - for (i = 0; i < trace->nr_entries; i++) { + for (i = 0; i < nr_entries && size; i++) { generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', - (void *)trace->entries[i]); + (void *)entries[i]); total += generated; - - /* Assume that generated isn't a negative number */ if (generated >= size) { buf += size; size = 0; @@ -51,6 +70,14 @@ int snprint_stack_trace(char *buf, size_t size, return total; } +EXPORT_SYMBOL_GPL(stack_trace_snprint); + +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + return stack_trace_snprint(buf, size, trace->entries, + trace->nr_entries, spaces); +} EXPORT_SYMBOL_GPL(snprint_stack_trace); /* @@ -77,3 +104,116 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); return -ENOSYS; } + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace(&trace); + return trace.nr_entries; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr, + }; + + save_stack_trace_regs(regs, &trace); + return trace.nr_entries; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. 
+ */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + int ret = save_stack_trace_tsk_reliable(tsk, &trace); + + return ret ? ret : trace.nr_entries; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + + save_stack_trace_user(&trace); + return trace.nr_entries; +} +#endif /* CONFIG_USER_STACKTRACE_SUPPORT */ -- cgit v1.2.3 From 1b59562d3ab09dcef188eb8055d05f0336380394 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:57 +0200 Subject: backtrace-test: Simplify stack trace handling Replace the indirection through struct stack_trace by using the storage array based interfaces. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.501919093@linutronix.de --- kernel/backtracetest.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 1323360d90e3..a563c8fdad0d 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -48,19 +48,14 @@ static void backtrace_test_irq(void) #ifdef CONFIG_STACKTRACE static void backtrace_test_saved(void) { - struct stack_trace trace; unsigned long entries[8]; + unsigned int nr_entries; pr_info("Testing a saved backtrace.\n"); pr_info("The following trace is a kernel self test and not a bug!\n"); - trace.nr_entries = 0; - trace.max_entries = ARRAY_SIZE(entries); - trace.entries = entries; - trace.skip = 0; - - save_stack_trace(&trace); - print_stack_trace(&trace, 0); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + stack_trace_print(entries, nr_entries, 0); } #else static void backtrace_test_saved(void) -- cgit v1.2.3 From f93877214a83e88373b20801c2d671923d03d07d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:59 +0200 Subject: latency_top: Simplify stack trace handling Replace the indirection through struct stack_trace with an invocation of the storage array based interface. 
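The conversion follows the same pattern as the other call sites in this series: the temporary struct stack_trace and its helper disappear in favour of one call into the task variant of the new API. A hedged sketch of the resulting shape, with the record type reduced to its backtrace member for illustration (LT_BACKTRACEDEPTH is the existing depth constant; the struct and function names here are simplified assumptions):

struct example_latency_record {
	unsigned long backtrace[LT_BACKTRACEDEPTH];
};

static void example_store_backtrace(struct task_struct *tsk,
				    struct example_latency_record *lat)
{
	/* One call replaces the memset()/struct stack_trace setup. */
	stack_trace_save_tsk(tsk, lat->backtrace, LT_BACKTRACEDEPTH, 0);
}
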
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.683039030@linutronix.de --- kernel/latencytop.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index f5a90ab3c6b9..99a5b5f46dc5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk, memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, - struct latency_record *lat) -{ - struct stack_trace trace; - - memset(&trace, 0, sizeof(trace)); - trace.max_entries = LT_BACKTRACEDEPTH; - trace.entries = &lat->backtrace[0]; - save_stack_trace_tsk(tsk, &trace); -} - /** * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency @@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.count = 1; lat.time = usecs; lat.max = usecs; - store_stacktrace(tsk, &lat); + + stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0); raw_spin_lock_irqsave(&latency_lock, flags); -- cgit v1.2.3 From 746017ed8d4d3c2070bb03aee9536f24da43c778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:05 +0200 Subject: dma/debug: Simplify stacktrace retrieval Replace the indirection through struct stack_trace with an invocation of the storage array based interface. 
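As with the other conversions, the per-entry struct stack_trace plus separate storage array collapse into the array and a length. A sketch of the embedded-storage pattern used below, assuming a structure reduced to the two relevant members (the real dma_debug_entry carries many more fields; DMA_DEBUG_STACKTRACE_ENTRIES is the existing constant):

struct example_debug_entry {
	unsigned int  stack_len;
	unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
};

static void example_record_alloc_site(struct example_debug_entry *entry)
{
	/* Skip one frame so the mapping site, not this helper, leads. */
	entry->stack_len = stack_trace_save(entry->stack_entries,
					    ARRAY_SIZE(entry->stack_entries),
					    1);
}

Printing later only needs the pair back: stack_trace_print(entry->stack_entries, entry->stack_len, 0).
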
Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.248658135@linutronix.de --- kernel/dma/debug.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a218e43cc382..badd77670d00 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -89,8 +89,8 @@ struct dma_debug_entry { int sg_mapped_ents; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE - struct stack_trace stacktrace; - unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; + unsigned int stack_len; + unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif }; @@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) #ifdef CONFIG_STACKTRACE if (entry) { pr_warning("Mapped at:\n"); - print_stack_trace(&entry->stacktrace, 0); + stack_trace_print(entry->stack_entries, entry->stack_len, 0); } #endif } @@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_unlock_irqrestore(&free_entries_lock, flags); #ifdef CONFIG_STACKTRACE - entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; - entry->stacktrace.entries = entry->st_entries; - entry->stacktrace.skip = 1; - save_stack_trace(&entry->stacktrace); + entry->stack_len = stack_trace_save(entry->stack_entries, + ARRAY_SIZE(entry->stack_entries), + 1); #endif - return entry; } -- cgit v1.2.3 From b1abe4622d4cc32b3b37cfefbc7ac070a8f868e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:10 +0200 Subject: lockdep: Remove unused trace argument from print_circular_bug() Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.716274532@linutronix.de --- kernel/locking/lockdep.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git 
a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2edf9501d906..d7615d299d08 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1522,10 +1522,9 @@ static inline int class_equal(struct lock_list *entry, void *data) } static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt, - struct stack_trace *trace) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -2206,7 +2205,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ save(trace); } - return print_circular_bug(&this, target_entry, next, prev, trace); + return print_circular_bug(&this, target_entry, next, prev); } else if (unlikely(ret < 0)) return print_bfs_bug(ret); -- cgit v1.2.3 From 76b14436b4d98903fef723365170bedd6f28ab2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:11 +0200 Subject: lockdep: Remove save argument from check_prev_add() There is only one caller which hands in save_trace as function pointer. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.803362058@linutronix.de --- kernel/locking/lockdep.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d7615d299d08..3603893d5bbd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2158,8 +2158,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace, - int (*save)(struct stack_trace *trace)) + struct held_lock *next, int distance, struct stack_trace *trace) { struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; @@ -2199,11 +2198,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (unlikely(!ret)) { if (!trace->entries) { /* - * If @save fails here, the printing might trigger - * a WARN but because of the !nr_entries it should - * not do bad things. + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. 
*/ - save(trace); + save_trace(trace); } return print_circular_bug(&this, target_entry, next, prev); } @@ -2253,7 +2252,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!trace->entries && !save(trace)) + if (!trace->entries && !save_trace(trace)) return 0; /* @@ -2318,7 +2317,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + int ret = check_prev_add(curr, hlock, next, distance, + &trace); if (!ret) return 0; -- cgit v1.2.3 From c120bce78065cbea460a58b1572c215db9c148ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:12 +0200 Subject: lockdep: Simplify stack trace handling Replace the indirection through struct stack_trace by using the storage array based interfaces and storing the information in a small lockdep specific data structure. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.891724020@linutronix.de --- kernel/locking/lockdep.c | 55 ++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3603893d5bbd..45bcaf2e4cb6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -434,18 +434,14 @@ static void print_lockdep_off(const char *bug_msg) #endif } -static int save_trace(struct stack_trace *trace) +static int save_trace(struct lock_trace *trace) { - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - trace->max_entries = trace->nr_entries; + unsigned long *entries = stack_trace + nr_stack_trace_entries; + unsigned int max_entries; + trace->offset = nr_stack_trace_entries; + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->nr_entries = stack_trace_save(entries, max_entries, 3); nr_stack_trace_entries += trace->nr_entries; if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -1196,7 +1192,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct stack_trace *trace) + struct lock_trace *trace) { struct lock_list *entry; /* @@ -1415,6 +1411,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry, * checking. 
*/ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ + unsigned long *entries = stack_trace + trace->offset; + + stack_trace_print(entries, trace->nr_entries, spaces); +} + /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): @@ -1427,8 +1430,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_stack_trace(&target->trace, 6); - + print_lock_trace(&target->trace, 6); return 0; } @@ -1740,7 +1742,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_stack_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); @@ -1765,7 +1767,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); + print_lock_trace(&entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1878,14 +1880,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2158,7 +2160,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace) + struct held_lock *next, int distance, struct lock_trace *trace) { struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; @@ -2196,7 +2198,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) { - if (!trace->entries) { + if (!trace->nr_entries) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it @@ -2252,7 +2254,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!trace->entries && !save_trace(trace)) + if (!trace->nr_entries && !save_trace(trace)) return 0; /* @@ -2284,14 +2286,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { + struct lock_trace trace = { .nr_entries = 0 }; int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace = { - .nr_entries = 0, - .max_entries = 0, - .entries = NULL, - .skip = 0, - }; /* * Debugging checks. 
@@ -2719,6 +2716,10 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +} #endif /* @@ -2815,7 +2816,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); -- cgit v1.2.3 From e7d916632b528e8cccc8e9ccca81acfc591a5fde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:13 +0200 Subject: tracing: Simplify stacktrace retrieval in histograms The indirection through struct stack_trace is not necessary at all. Use the storage array based interface. Signed-off-by: Thomas Gleixner Tested-by: Tom Zanussi Reviewed-by: Tom Zanussi Reviewed-by: Josh Poimboeuf Acked-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.979089273@linutronix.de --- kernel/trace/trace_events_hist.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21ceae299f7e..a1d20421f4b0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5186,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; - struct stack_trace stacktrace; struct hist_field *key_field; u64 field_contents; void *key = NULL; @@ -5198,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - stacktrace.max_entries = HIST_STACKTRACE_DEPTH; - stacktrace.entries = entries; - stacktrace.nr_entries = 0; - stacktrace.skip = HIST_STACKTRACE_SKIP; - - memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); - save_stack_trace(&stacktrace); - + memset(entries, 0, HIST_STACKTRACE_SIZE); + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); key = entries; } else { field_contents = key_field->fn(key_field, elt, rbe, rec); -- cgit v1.2.3 From 2a820bf74918d61ea54f7c1001f4a6a2e457577c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:14 +0200 Subject: tracing: Use percpu stack trace buffer more intelligently The per cpu stack trace buffer usage pattern is odd at best. The buffer has place for 512 stack trace entries on 64-bit and 1024 on 32-bit. 
When interrupts or exceptions nest after the per cpu buffer was acquired, the stacktrace length is hardcoded to 8 entries. 512/1024 stack trace entries in kernel stacks are unrealistic so the buffer is a complete waste. Split the buffer into 4 nest levels, which are 128/256 entries per level. This allows nesting contexts (interrupts, exceptions) to utilize the cpu buffer for stack retrieval and avoids the fixed length allocation along with the conditional execution paths. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.066064076@linutronix.de --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..4fc93004feab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2749,12 +2749,21 @@ trace_function(struct trace_array *tr, #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING 4 + +#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) + struct ftrace_stack { - unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; + unsigned long calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { + struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -2763,10 +2772,11 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + struct ftrace_stack *fstack; struct stack_entry *entry; struct stack_trace trace; - int use_stack; - int size = FTRACE_STACK_ENTRIES; + int size = FTRACE_KSTACK_ENTRIES; + int stackidx; trace.nr_entries = 0; trace.skip = skip; @@ -2788,29 +2798,32 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = __this_cpu_inc_return(ftrace_stack_reserve); + stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + + /* This should never happen. If it does, yell once and skip */ + if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + goto out; + /* - * We don't need any atomic variables, just a barrier. - * If an interrupt comes in, we don't care, because it would - * have exited and put the counter back to what we want. - * We just need a barrier to keep gcc from moving things - * around. 
+ * The above __this_cpu_inc_return() is 'atomic' cpu local. An + * interrupt will either see the value pre increment or post + * increment. If the interrupt happens pre increment it will have + * restored the counter when it returns. We just need a barrier to + * keep gcc from moving things around. */ barrier(); - if (use_stack == 1) { - trace.entries = this_cpu_ptr(ftrace_stack.calls); - trace.max_entries = FTRACE_STACK_MAX_ENTRIES; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); + fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; + trace.entries = fstack->calls; + trace.max_entries = FTRACE_KSTACK_ENTRIES; - if (trace.nr_entries > size) - size = trace.nr_entries; - } else - /* From now on, use_stack is a boolean */ - use_stack = 0; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; size *= sizeof(unsigned long); @@ -2820,19 +2833,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, goto out; entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, size); - - if (use_stack) - memcpy(&entry->caller, trace.entries, - trace.nr_entries * sizeof(unsigned long)); - else { - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.entries = entry->caller; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - } + memcpy(&entry->caller, trace.entries, size); entry->size = trace.nr_entries; -- cgit v1.2.3 From c438f140cc16d47fac808d893f5017f6d641cb46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:15 +0200 Subject: tracing: Make ftrace_trace_userstack() static and conditional It's only used in trace.c and there is absolutely no point in compiling it in when user space stack traces are not supported. 
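The mechanics are the usual configuration-stub pattern: compile the real implementation only under CONFIG_USER_STACKTRACE_SUPPORT and provide an empty static stub otherwise, so the call sites in trace.c stay free of #ifdefs. A skeleton of the shape, with the real bodies elided (they are in the diff below):

#ifdef CONFIG_USER_STACKTRACE_SUPPORT
static void ftrace_trace_userstack(struct ring_buffer *buffer,
				   unsigned long flags, int pc)
{
	/* ... record the user space stack into the ring buffer ... */
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
static void ftrace_trace_userstack(struct ring_buffer *buffer,
				   unsigned long flags, int pc)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
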
Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.162400595@linutronix.de --- kernel/trace/trace.c | 14 ++++++++------ kernel/trace/trace.h | 8 -------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4fc93004feab..d8369d27c1af 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -2905,9 +2907,10 @@ void trace_dump_stack(int skip) } EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); -void +static void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; @@ -2958,13 +2961,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) out: preempt_enable(); } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) { - ftrace_trace_userstack(tr, flags, preempt_count()); } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..639047b259d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, - int pc); - void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_userstack(struct ring_buffer *buffer, - unsigned long flags, int pc) -{ -} - static inline void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { -- cgit v1.2.3 From ee6dd0db4d8de41a0a0bc37d8d87a0b1623f83b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:16 +0200 Subject: tracing: Simplify stack trace retrieval Replace the indirection through struct stack_trace by using the storage array based interfaces. 
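After the conversion, the kernel-stack path boils down to choosing between the regs-based and the plain save, then sizing the ring buffer event from the returned count. A condensed sketch (variable names follow the patch; the wrapper function itself is illustrative, not part of the change):

/*
 * Illustrative condensation of the new call pattern in
 * __ftrace_trace_stack(): save into the per-cpu storage array
 * and report how many entries were captured.
 */
static unsigned int example_save_kernel_stack(struct pt_regs *regs,
					      unsigned long *calls,
					      unsigned int size, int skip)
{
	if (regs)
		return stack_trace_save_regs(regs, calls, size, skip);

	return stack_trace_save(calls, size, skip);
}

The event payload is then nr_entries * sizeof(unsigned long) bytes, copied straight from the storage array.
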
Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.248604594@linutronix.de --- kernel/trace/trace.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d8369d27c1af..0ce8515dd470 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2774,22 +2774,18 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + unsigned int size, nr_entries; struct ftrace_stack *fstack; struct stack_entry *entry; - struct stack_trace trace; - int size = FTRACE_KSTACK_ENTRIES; int stackidx; - trace.nr_entries = 0; - trace.skip = skip; - /* * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ #ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip++; + skip++; #endif /* @@ -2816,28 +2812,24 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, barrier(); fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; - trace.entries = fstack->calls; - trace.max_entries = FTRACE_KSTACK_ENTRIES; - - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - - if (trace.nr_entries > size) - size = trace.nr_entries; + size = ARRAY_SIZE(fstack->calls); - size *= sizeof(unsigned long); + if (regs) { + nr_entries = stack_trace_save_regs(regs, fstack->calls, + size, skip); + } else { + nr_entries = stack_trace_save(fstack->calls, size, skip); + } + size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); - memcpy(&entry->caller, trace.entries, size); - - entry->size = trace.nr_entries; + memcpy(&entry->caller, fstack->calls, size); + entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -2916,7 +2908,6 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - struct stack_trace trace; if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; @@ -2947,12 +2938,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = 0; - trace.entries = entry->caller; - - 
save_stack_trace_user(&trace); + stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); -- cgit v1.2.3 From 9f50c91b1195dfffd183d5d8505e45af86623532 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:17 +0200 Subject: tracing: Remove the last struct stack_trace usage Simplify the stack retrieval code by using the storage array based interface. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.340000461@linutronix.de --- kernel/trace/trace_stack.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4efda5f75a0f..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -23,11 +23,7 @@ static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -struct stack_trace stack_trace_max = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = &stack_dump_trace[0], -}; - +static unsigned int stack_trace_nr_entries; static unsigned long stack_trace_max_size; static arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -44,10 +40,10 @@ static void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); - for (i = 0; i < stack_trace_max.nr_entries; i++) { - if (i + 1 == stack_trace_max.nr_entries) + for (i = 0; i < stack_trace_nr_entries; i++) { + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -93,13 +89,12 @@ static void check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; - stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 0; - - save_stack_trace(&stack_trace_max); + stack_trace_nr_entries = stack_trace_save(stack_dump_trace, + ARRAY_SIZE(stack_dump_trace) - 1, + 0); /* Skip over the overhead of the stack tracer itself */ - for (i = 0; i < stack_trace_max.nr_entries; i++) { + for (i = 0; i < stack_trace_nr_entries; i++) { if (stack_dump_trace[i] == ip) break; } @@ -108,7 +103,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) * Some archs may not have the passed in ip in the dump. * If that happens, we need to show everything. 
*/ - if (i == stack_trace_max.nr_entries) + if (i == stack_trace_nr_entries) i = 0; /* @@ -126,13 +121,13 @@ static void check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < stack_trace_max.nr_entries) { + while (i < stack_trace_nr_entries) { int found = 0; stack_trace_index[x] = this_size; p = start; - for (; p < top && i < stack_trace_max.nr_entries; p++) { + for (; p < top && i < stack_trace_nr_entries; p++) { /* * The READ_ONCE_NOCHECK is used to let KASAN know that * this is not a stack-out-of-bounds error. @@ -163,7 +158,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) i++; } - stack_trace_max.nr_entries = x; + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { print_max_stack(); @@ -265,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= stack_trace_max.nr_entries) + if (n >= stack_trace_nr_entries) return NULL; m->private = (void *)n; @@ -329,7 +324,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); @@ -339,10 +334,10 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= stack_trace_max.nr_entries) + if (i >= stack_trace_nr_entries) return 0; - if (i + 1 == stack_trace_max.nr_entries) + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; -- cgit v1.2.3 From 25e39e32b0a3f99b9db320605f20f91d425b6a65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:18 +0200 Subject: livepatch: Simplify stack trace retrieval Replace the indirection through struct stack_trace by using the storage array based interfaces. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Miroslav Benes Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.437950229@linutronix.de --- kernel/livepatch/transition.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 9c89ae8b337a..c53370d596be 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task) * Determine whether the given stack trace includes any references to a * to-be-patched or to-be-unpatched function. 
*/ -static int klp_check_stack_func(struct klp_func *func, - struct stack_trace *trace) +static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, + unsigned int nr_entries) { unsigned long func_addr, func_size, address; struct klp_ops *ops; int i; - for (i = 0; i < trace->nr_entries; i++) { - address = trace->entries[i]; + for (i = 0; i < nr_entries; i++) { + address = entries[i]; if (klp_target_state == KLP_UNPATCHED) { /* @@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func, static int klp_check_stack(struct task_struct *task, char *err_buf) { static unsigned long entries[MAX_STACK_ENTRIES]; - struct stack_trace trace; struct klp_object *obj; struct klp_func *func; - int ret; + int ret, nr_entries; - trace.skip = 0; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_ENTRIES; - trace.entries = entries; - ret = save_stack_trace_tsk_reliable(task, &trace); + ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); WARN_ON_ONCE(ret == -ENOSYS); - if (ret) { + if (ret < 0) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d has an unreliable stack\n", __func__, task->comm, task->pid); return ret; } + nr_entries = ret; klp_for_each_object(klp_transition_patch, obj) { if (!obj->patched) continue; klp_for_each_func(obj, func) { - ret = klp_check_stack_func(func, &trace); + ret = klp_check_stack_func(func, entries, nr_entries); if (ret) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d is sleeping on function %s\n", -- cgit v1.2.3 From 988ec8841ca1e22b2978fce0134d8267e838770e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:19 +0200 Subject: stacktrace: Remove obsolete functions No more users of the struct stack_trace based interfaces. Remove them. Remove the macro stubs for !CONFIG_STACKTRACE as well; they are pointless because the storage on the call sites is already conditional on CONFIG_STACKTRACE. No point in being 'smart'.
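For callers of the removed helpers, the migration is mechanical; a hedged sketch, where entries and nr_entries are assumed to come from a prior stack_trace_save():

    /* Before: print_stack_trace(&trace, 4); */
    stack_trace_print(entries, nr_entries, 4);

    /* Before: ret = snprint_stack_trace(buf, size, &trace, 4); */
    ret = stack_trace_snprint(buf, size, entries, nr_entries, 4);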
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.524796783@linutronix.de --- kernel/stacktrace.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index b38333b3bc18..dd55312f3fe9 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -30,12 +30,6 @@ void stack_trace_print(unsigned long *entries, unsigned int nr_entries, } EXPORT_SYMBOL_GPL(stack_trace_print); -void print_stack_trace(struct stack_trace *trace, int spaces) -{ - stack_trace_print(trace->entries, trace->nr_entries, spaces); -} -EXPORT_SYMBOL_GPL(print_stack_trace); - /** * stack_trace_snprint - Print the entries in the stack trace into a buffer * @buf: Pointer to the print buffer @@ -72,14 +66,6 @@ int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_trace_snprint); -int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces) -{ - return stack_trace_snprint(buf, size, trace->entries, - trace->nr_entries, spaces); -} -EXPORT_SYMBOL_GPL(snprint_stack_trace); - /* * Architectures that do not implement save_stack_trace_*() * get these weak aliases and once-per-bootup warnings -- cgit v1.2.3 From 214d8ca6ee854f696f75e75511fe66b409e656db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:21 +0200 Subject: stacktrace: Provide common infrastructure All architectures which support stacktrace carry duplicated code and do the stack storage and filtering on the architecture side. Provide a consolidated interface with a callback function for consuming the stack entries provided by the architecture specific stack walker. This removes lots of duplicated code and allows implementing better filtering than 'skip number of entries' in the future without touching any architecture specific code.
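To make the callback contract concrete, a hedged sketch of the architecture side; the unwind_* helpers and the state structure are stand-ins for whatever unwinder a given architecture actually provides:

    /* Hypothetical arch implementation feeding the common consumer. */
    void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
                         struct task_struct *task, struct pt_regs *regs)
    {
            struct unwind_state state;

            for (unwind_start(&state, task, regs); !unwind_done(&state);
                 unwind_next_frame(&state)) {
                    /* Stop once the consumer says its storage is full. */
                    if (!consume_entry(cookie, unwind_get_return_address(&state), false))
                            break;
            }
    }

The common code then layers skipping, storage, and filtering on top via the cookie, as the kernel/stacktrace.c hunk that follows shows.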
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: linux-arch@vger.kernel.org Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Link: https://lkml.kernel.org/r/20190425094803.713568606@linutronix.de --- kernel/stacktrace.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index dd55312f3fe9..27bafc1e271e 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -5,6 +5,8 @@ * * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ +#include +#include #include #include #include @@ -66,6 +68,175 @@ int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_trace_snprint); +#ifdef CONFIG_ARCH_STACKWALK + +struct stacktrace_cookie { + unsigned long *store; + unsigned int size; + unsigned int skip; + unsigned int len; +}; + +static bool stack_trace_consume_entry(void *cookie, unsigned long addr, + bool reliable) +{ + struct stacktrace_cookie *c = cookie; + + if (c->len >= c->size) + return false; + + if (c->skip > 0) { + c->skip--; + return true; + } + c->store[c->len++] = addr; + return c->len < c->size; +} + +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, + bool reliable) +{ + if (in_sched_functions(addr)) + return true; + return stack_trace_consume_entry(cookie, addr, reliable); +} + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + arch_stack_walk(consume_entry, &c, current, NULL); + return c.len; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + if (!try_get_task_stack(tsk)) + return 0; + + arch_stack_walk(consume_entry, &c, tsk, NULL); + put_task_stack(tsk); + return c.len; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr, + }; + + arch_stack_walk(consume_entry, &c, current, regs); + return c.len; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + int ret; + + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ + if (!try_get_task_stack(tsk)) + return 0; + + ret = arch_stack_walk_reliable(consume_entry, &c, tsk); + put_task_stack(tsk); + return ret; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + + /* Trace user stack if not a kernel thread */ + if (!current->mm) + return 0; + + arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + return c.len; +} +#endif + +#else /* CONFIG_ARCH_STACKWALK */ + /* * Architectures that do not implement save_stack_trace_*() * get these weak aliases and once-per-bootup warnings @@ -203,3 +374,5 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) return trace.nr_entries; } #endif /* CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* !CONFIG_ARCH_STACKWALK */ -- cgit v1.2.3 From 31adf2308f33dcae59009019675224be0978bc70 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 24 Apr 2019 10:55:48 +0200 Subject: livepatch: Convert error about unsupported reliable stacktrace into a warning The commit d0807da78e11d46f ("livepatch: Remove immediate feature") caused any livepatch to be refused when reliable stacktraces were not supported on the given architecture. The limitation is too strong. User space processes are safely migrated even when entering or leaving the kernel. Kthread transitions would need to be forced. But it is safe when: + The livepatch does not change the semantics of the code. + Callbacks do not depend on a safely finished transition. Suggested-by: Josh Poimboeuf Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..14f33ab6c583 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1003,11 +1003,10 @@ int klp_enable_patch(struct klp_patch *patch) return -ENODEV; if (!klp_have_reliable_stack()) { - pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -EOPNOTSUPP; + pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); + pr_warn("The livepatch transition may never complete.\n"); } - mutex_lock(&klp_mutex); ret = klp_init_patch_early(patch); -- cgit v1.2.3 From d671002be6bdd7f77a771e23bf3e95d1f16775e6 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 29 Apr 2019 20:26:31 +0800 Subject: locking/lockdep: Remove unnecessary unlikely() DEBUG_LOCKS_WARN_ON() already contains an unlikely(); there is no need for another one. Signed-off-by: zhengbin Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: houtao1@huawei.com Link: http://lkml.kernel.org/r/1556540791-23110-1-git-send-email-zhengbin13@huawei.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 25ecc6d3058b..6426d071a324 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3256,7 +3256,7 @@ void lockdep_hardirqs_on(unsigned long ip) /* * See the fine text that goes along with this variable definition.
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) return; /* -- cgit v1.2.3 From 08970ecf744e09837bb6620c95406710f4c81ae2 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 18 Apr 2019 16:54:01 +0100 Subject: irq/irqdomain: Fix typo in the comment on top of __irq_domain_alloc_irqs() The word 'number' has been misspelt in the comment on top of __irq_domain_alloc_irqs(). Signed-off-by: Julien Grall Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9ed29e4a7dbf..a453e229f99c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from - * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument -- cgit v1.2.3 From 43d8ce9d65a54846d378545770991e65838981e0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 26 Apr 2019 15:04:29 -0400 Subject: Provide in-kernel headers to make extending kernel easier Introduce in-kernel headers which are made available as an archive through proc (/proc/kheaders.tar.xz file). This archive makes it possible to run eBPF and other tracing programs that need to extend the kernel for tracing purposes without any dependency on the file system having headers. A github PR is sent for the corresponding BCC patch at: https://github.com/iovisor/bcc/pull/2312 On Android and embedded systems, it is common to switch kernels but not have kernel headers available on the file system. Further, once a different kernel is booted, any headers stored on the file system will no longer be useful. This issue is well known even to distros. By storing the headers as a compressed archive within the kernel, we can avoid these issues that have been a hindrance for a long time. The best way to use this feature is by building it in. Several users have a need for this: when they switch debug kernels, they do not want to update the filesystem or worry about where to store the headers on it. However, the feature is also buildable as a module in case the user prefers that it not be part of the kernel image. This makes it possible to load and unload the headers from memory on demand. A tracing program can load the module, do its operations, and then unload the module to save kernel memory. The total memory needed is 3.3MB. By having the archive available at a fixed location independent of filesystem dependencies and conventions, all debugging tools can refer directly to the fixed location for the archive, without being concerned with where the headers live on a typical filesystem; this significantly simplifies tooling that needs kernel headers. The code to read the headers is based on the /proc/config.gz code and uses the same technique to embed the headers. Other approaches were discussed, such as having an in-memory mountable filesystem, but that has drawbacks such as requiring an in-kernel xz decompressor, which we don't have today, and requiring 42 MB of kernel memory to host the decompressed headers at any time. This approach is also simpler than those alternatives.
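For a sense of how tooling consumes the archive, a small hedged userspace sketch that copies it out of proc for later extraction; error handling is minimal and the output path is arbitrary:

    #include <stdio.h>

    int main(void)
    {
            char buf[4096];
            size_t n;
            FILE *in = fopen("/proc/kheaders.tar.xz", "rb");
            FILE *out = fopen("kheaders.tar.xz", "wb");

            if (!in || !out)
                    return 1;
            while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                    fwrite(buf, 1, n, out);
            fclose(in);
            fclose(out);
            return 0;       /* then: tar -xJf kheaders.tar.xz -C <dest> */
    }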
Reviewed-by: Masahiro Yamada Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/.gitignore | 1 + kernel/Makefile | 10 ++++++ kernel/gen_ikh_data.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 74 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 174 insertions(+) create mode 100755 kernel/gen_ikh_data.sh create mode 100644 kernel/kheaders.c (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index 6e699100872f..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,5 +1,6 @@ # # Generated files # +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..12399614c350 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -121,3 +122,12 @@ $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) + +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz + +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. 
+pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include +#include +#include +#include +#include + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. 
+ */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + return simple_read_from_buffer(buf, len, offset, + &kernel_headers_data, + &kernel_headers_data_end - + &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { + .read = ikheaders_read_current, + .llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current headers file */ + entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, + &ikheaders_file_ops); + if (!entry) + return -ENOMEM; + + proc_set_size(entry, + &kernel_headers_data_end - + &kernel_headers_data); + return 0; +} + +static void __exit ikheaders_cleanup(void) +{ + remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); -- cgit v1.2.3 From c7b6f29b6257532792fc722b68fcc0e00b5a856c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:43 -0700 Subject: bpf: Fail bpf_probe_write_user() while mm is switched When using a temporary mm, bpf_probe_write_user() should not be able to write to user memory, since user memory addresses may be used to map kernel memory. Detect these cases and fail bpf_probe_write_user() in such cases. Suggested-by: Jann Horn Reported-by: Jann Horn Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Borkmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-24-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/trace/bpf_trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,8 @@ #include #include +#include + #include "trace_probe.h" #include "trace.h" @@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. + * + * nmi_uaccess_okay() ensures the probe is not run in an interim + * state, when the task or mm are switched. This is specifically + * required to prevent the use of temporary mm. 
*/ if (unlikely(in_interrupt() || @@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; if (!access_ok(unsafe_ptr, size)) return -EPERM; -- cgit v1.2.3 From aad42dd44db086c79ca3f470ad563d2ac4ac218d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 26 Apr 2019 16:22:44 -0700 Subject: uprobes: Initialize uprobes earlier In order to have a separate address space for text poking, we need to duplicate init_mm early during start_kernel(). This, however, introduces a problem since uprobes functions are called from dup_mmap(), but uprobes is still not initialized at this early stage. Since uprobes initialization is necessary for fork, and since all the dependent initialization has been done when fork is initialized (percpu and vmalloc), move uprobes initialization to fork_init(). Uprobes does not appear to introduce any security problem for the poking_mm. Crash and burn if uprobes initialization fails, similarly to other early initializations. Rename init_uprobes() to uprobes_init() to match the naming convention of other early initialization functions. Reported-by: kernel test robot Signed-off-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rick Edgecombe Cc: Rik van Riel Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: ard.biesheuvel@linaro.org Cc: deneen.t.dock@intel.com Cc: kernel-hardening@lists.openwall.com Cc: kristen@linux.intel.com Cc: linux_dti@icloud.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190426232303.28381-6-nadav.amit@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 +++----- kernel/fork.c | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..e6a0d6be87e3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = { .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; -static int __init init_uprobes(void) +void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - if (percpu_init_rwsem(&dup_mmap_sem)) - return -ENOMEM; + BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - return register_die_notifier(&uprobe_exception_nb); + BUG_ON(register_die_notifier(&uprobe_exception_nb)); } -__initcall(init_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..44fba5e5e916 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -815,6 +815,7 @@ void __init fork_init(void) #endif lockdep_init_task(&init_task); + uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, -- cgit v1.2.3 From 13585fa0668c724efab9635aaeef6ec390217415 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:25 -0700 Subject: fork: Provide a function for copying init_mm Provide a function for copying init_mm. This function will later be used for setting a temporary mm. Tested-by: Masami Hiramatsu Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H.
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-6-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44fba5e5e916..fbe9dfcd8680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1299,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. +/** + * dup_mm() - duplicates an existing mm structure + * @tsk: the task_struct with which the new mm will be associated. + * @oldmm: the mm to duplicate. + * + * Allocates a new mm structure and duplicates the provided @oldmm structure + * content into it. + * + * Return: the duplicated mm or NULL on failure. */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk, + struct mm_struct *oldmm) { - struct mm_struct *mm, *oldmm = current->mm; + struct mm_struct *mm; int err; mm = allocate_mm(); @@ -1372,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) } retval = -ENOMEM; - mm = dup_mm(tsk); + mm = dup_mm(tsk, current->mm); if (!mm) goto fail_nomem; @@ -2187,6 +2194,11 @@ struct task_struct *fork_idle(int cpu) return task; } +struct mm_struct *copy_init_mm(void) +{ + return dup_mm(NULL, &init_mm); +} + /* * Ok, this is the main fork-routine. * -- cgit v1.2.3 From f2c65fb3221adc6b73b0549fc7ba892022db9797 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:31 -0700 Subject: x86/modules: Avoid breaking W^X while loading modules When modules and BPF filters are loaded, there is a time window in which some memory is both writable and executable. An attacker that has already found another vulnerability (e.g., a dangling pointer) might be able to exploit this behavior to overwrite kernel code. Prevent having writable executable PTEs in this stage. In addition, avoiding having W+X mappings can also slightly simplify the patching of modules code on initialization (e.g., by alternatives and static-key), as would be done in the next patch. This was actually the main motivation for this patch. To avoid having W+X mappings, set them initially as RW (NX) and after they are set as RO set them as X as well. Setting them as executable is done as a separate step to avoid one core in which the old PTE is cached (hence writable), and another which sees the updated PTE (executable), which would break the W^X protection. Suggested-by: Thomas Gleixner Suggested-by: Andy Lutomirski Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Jessica Yu Cc: Kees Cook Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Rik van Riel Link: https://lkml.kernel.org/r/20190426001143.4983-12-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..2b2845ae983e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1950,8 +1950,13 @@ void module_enable_ro(const struct module *mod, bool after_init) return; frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->core_layout, set_memory_x); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_ro); if (after_init) -- cgit v1.2.3 From d63326928611600ad65baff54a70f53b02b3cdfe Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:35 -0700 Subject: mm/hibernation: Make hibernation handle unmapped pages Make hibernate handle unmapped pages on the direct map when CONFIG_ARCH_HAS_SET_ALIAS=y is set. These functions allow for setting pages to invalid configurations, so hibernate should now check whether the pages have valid mappings and handle them if they are unmapped when doing a hibernate save operation. Previously this checking was already done when CONFIG_DEBUG_PAGEALLOC=y was configured. It does not appear to have a big impact on hibernation performance. The speed of the saving operation before this change was measured as 819.02 MB/s, and after was measured at 813.32 MB/s. Before: [ 4.670938] PM: Wrote 171996 kbytes in 0.21 seconds (819.02 MB/s) After: [ 4.504714] PM: Wrote 178932 kbytes in 0.22 seconds (813.32 MB/s) Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Acked-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-16-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/power/snapshot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) * safe_copy_page - Copy a page in a safe way. * * Check if the page we are going to copy is marked as present in the kernel - * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set - * and in that case kernel_page_present() always returns 'true'). + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'. */ static void safe_copy_page(void *dst, struct page *s_page) { -- cgit v1.2.3 From 1a7b7d9220819afe79d1ec5d759fe4349bd2453e Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:37 -0700 Subject: modules: Use vmalloc special flag Use the new flag for handling freeing of special permissioned memory in vmalloc, and remove the places where memory was set RW before freeing, which is no longer needed. Since freeing of VM_FLUSH_RESET_PERMS memory is not supported in an interrupt by vmalloc, the freeing of init sections is moved to a work queue. Instead of call_rcu it now uses synchronize_rcu() in the work queue.
Lastly, there is now a WARN_ON in module_memfree since it should not be called in an interrupt with special memory as is required for VM_FLUSH_RESET_PERMS. Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Jessica Yu Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-18-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/module.c | 77 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2b2845ae983e..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +/* Work queue for freeing init sections in success case */ +static struct work_struct init_free_wq; +static struct llist_head init_free_list; + #ifdef CONFIG_MODULES_TREE_LOOKUP /* @@ -1949,6 +1953,8 @@ void module_enable_ro(const struct module *mod, bool after_init) if (!rodata_enabled) return; + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); frob_text(&mod->core_layout, set_memory_ro); frob_text(&mod->core_layout, set_memory_x); @@ -1972,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -static void module_disable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_x); - frob_ro_after_init(&mod->core_layout, set_memory_x); - frob_writable_data(&mod->core_layout, set_memory_x); - frob_rodata(&mod->init_layout, set_memory_x); - frob_writable_data(&mod->init_layout, set_memory_x); -} - /* Iterate through all modules and set each module's text as RW */ void set_all_modules_text_rw(void) { @@ -2024,23 +2021,8 @@ void set_all_modules_text_ro(void) } mutex_unlock(&module_mutex); } - -static void disable_ro_nx(const struct module_layout *layout) -{ - if (rodata_enabled) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); - frob_ro_after_init(layout, set_memory_rw); - } - frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_x); - frob_writable_data(layout, set_memory_x); -} - #else -static void disable_ro_nx(const struct module_layout *layout) { } static void module_enable_nx(const struct module *mod) { } -static void module_disable_nx(const struct module *mod) { } #endif #ifdef CONFIG_LIVEPATCH @@ -2120,6 +2102,11 @@ static void free_module_elf(struct module *mod) void __weak module_memfree(void *module_region) { + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. 
+ */ + WARN_ON(in_interrupt()); vfree(module_region); } @@ -2171,7 +2158,6 @@ static void free_module(struct module *mod) mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); kfree(mod->args); @@ -2181,7 +2167,6 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); } @@ -3420,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) /* For freeing module_init on success, in case kallsyms traversing */ struct mod_initfree { - struct rcu_head rcu; + struct llist_node node; void *module_init; }; -static void do_free_init(struct rcu_head *head) +static void do_free_init(struct work_struct *w) { - struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); - module_memfree(m->module_init); - kfree(m); + struct llist_node *pos, *n, *list; + struct mod_initfree *initfree; + + list = llist_del_all(&init_free_list); + + synchronize_rcu(); + + llist_for_each_safe(pos, n, list) { + initfree = container_of(pos, struct mod_initfree, node); + module_memfree(initfree->module_init); + kfree(initfree); + } } +static int __init modules_wq_init(void) +{ + INIT_WORK(&init_free_wq, do_free_init); + init_llist_head(&init_free_list); + return 0; +} +module_init(modules_wq_init); + /* * This is where the real work happens. * @@ -3507,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) #endif module_enable_ro(mod, true); mod_tree_remove_init(mod); - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; @@ -3518,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path, so use actual RCU here. + * path. module_memfree() cannot be called in an interrupt, so do the + * work and call synchronize_rcu() in a work queue. + * * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie * rcu_barrier() */ - call_rcu(&freeinit->rcu, do_free_init); + if (llist_add(&freeinit->node, &init_free_list)) + schedule_work(&init_free_wq); + mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3822,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* we can't deallocate the module until we clear memory protection */ - module_disable_ro(mod); - module_disable_nx(mod); - ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); -- cgit v1.2.3 From d53d2f78ceadba081fc7785570798c3c8d50a718 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:38 -0700 Subject: bpf: Use vmalloc special flag Use new flag VM_FLUSH_RESET_PERMS for handling freeing of special permissioned memory in vmalloc and remove places where memory was set RW before freeing which is no longer needed. Don't track if the memory is RO anymore because it is now tracked in vmalloc. 
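Put together, the lifecycle the flag enables looks roughly like the sketch below; the allocation helper and the single-page size are illustrative, not taken from the patch:

    /* Allocate, arm the reset-on-free behaviour, then harden. */
    void *p = module_alloc(PAGE_SIZE);

    set_vm_flush_reset_perms(p);    /* vfree() will flush and reset perms */
    set_memory_ro((unsigned long)p, 1);
    set_memory_x((unsigned long)p, 1);

    /* ... run as read-only executable text ... */

    vfree(p);       /* no need to flip the pages back to RW first */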
Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Borkmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-19-namit@vmware.com Signed-off-by: Ingo Molnar --- kernel/bpf/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..c605397c79f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_unlock_ro(hdr); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); -- cgit v1.2.3 From 2bd1298ac17777525a41c8425521f569e412df14 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Tue, 30 Apr 2019 15:42:22 +0530 Subject: genirq: Introduce irq_chip_{request,release}_resource_parent() apis Introduce irq_chip_{request,release}_resource_parent() apis so that these can be used in hierarchical irqchips. Signed-off-by: Lokesh Vutla Signed-off-by: Marc Zyngier --- kernel/irq/chip.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 51128bea3846..29d6c7d070b4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1459,6 +1459,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); + +/** + * irq_chip_request_resources_parent - Request resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +int irq_chip_request_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + + if (data->chip->irq_request_resources) + return data->chip->irq_request_resources(data); + + return -ENOSYS; +} +EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); + +/** + * irq_chip_release_resources_parent - Release resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_release_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_release_resources) + data->chip->irq_release_resources(data); +} +EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); #endif /** -- cgit v1.2.3 From 524845ff9c473d315468fa3b54054a7e6b2d95cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Apr 2019 22:31:29 -0400 Subject: bpf: switch to ->free_inode() Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Al Viro --- kernel/bpf/inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4a8f390a2b82..bc53e5b20ddc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -566,9 +566,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_destroy_inode_deferred(struct rcu_head *head) +static void bpf_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); enum bpf_type type; if (S_ISLNK(inode->i_mode)) @@ -578,16 +577,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head) free_inode_nonrcu(inode); } -static void bpf_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); -} - static const struct super_operations bpf_super_ops = { .statfs = 
simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { -- cgit v1.2.3 From 98587c2d894c34c9af5cd84ca169e1cd493aa692 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 30 Apr 2019 12:33:45 +0200 Subject: s390: simplify disabled_wait The disabled_wait() function uses its argument as the PSW address when it stops the CPU with a wait PSW that is disabled for interrupts. The different callers sometimes use a specific number like 0xdeadbeef to indicate a specific failure; the early boot code uses 0, and some other call sites use __builtin_return_address(0). At the time a dump is created, the current PSW and the registers of a CPU are written to lowcore to make them available to the dump analysis tool. For a CPU stopped with disabled_wait, the PSW and the registers do not really make sense together: the PSW address does not point to the function the registers belong to. Simplify disabled_wait() by using _THIS_IP_ for the PSW address and drop the argument to the function. Signed-off-by: Martin Schwidefsky --- kernel/panic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 0ae0d7332f12..c1fcaad337b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,12 +318,7 @@ void panic(const char *fmt, ...) } #endif #if defined(CONFIG_S390) - { - unsigned long caller; - - caller = (unsigned long)__builtin_return_address(0); - disabled_wait(caller); - } + disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); -- cgit v1.2.3 From a5d5092c9285f6c8937b56f9c6ff2b22d818fc25 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 26 Feb 2019 13:16:14 -0600 Subject: gdbstub: mark expected switch fall-throughs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warnings: kernel/debug/gdbstub.c: In function ‘gdb_serial_stub’: kernel/debug/gdbstub.c:1031:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (remcom_in_buffer[1] == '\0') { ^ kernel/debug/gdbstub.c:1036:3: note: here case 'C': /* Exception passing */ ^~~~ kernel/debug/gdbstub.c:1040:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (tmp == 0) ^ kernel/debug/gdbstub.c:1043:3: note: here case 'c': /* Continue packet */ ^~~~ kernel/debug/gdbstub.c:1050:4: warning: this statement may fall through [-Wimplicit-fallthrough=] dbg_activate_sw_breakpoints(); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/debug/gdbstub.c:1052:3: note: here default: ^~~~~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 Notice that, in this particular case, the code comment is modified in accordance with what GCC is expecting to find. This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R.
Silva Acked-by: Jason Wessel Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..9f267b8905b4 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif + /* Fall through */ case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through on tmp < 0 */ + /* Fall through - on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through to default processing */ + /* Fall through - to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, -- cgit v1.2.3 From 4cc168eaf3b67d76547fb420c22abe22a3c86003 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Apr 2019 11:33:42 -0500 Subject: gdbstub: Replace strcpy() by strscpy() The strcpy() function is being deprecated. Replace it by the safer strscpy() and fix the following Coverity warning: "You might overrun the 1024-character fixed-size string remcom_in_buffer by copying cmd without checking the length." Addresses-Coverity-ID: 138999 ("Copy into fixed size buffer") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 9f267b8905b4..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1095,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) return error; case 's': case 'c': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); return 0; case '$': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); gdbstub_prev_in_buf_pos = 0; return 0; -- cgit v1.2.3 From 9b555c4d784c468b4167eef9ab621b5203e4f479 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Apr 2019 11:21:06 -0500 Subject: kdb: kdb_support: replace strcpy() by strscpy() The strcpy() function is being deprecated. Replace it by the safer strscpy() and fix the following Coverity warning: "You might overrun the 129-character fixed-size string ks_namebuf by copying name without checking the length." Addresses-Coverity-ID: 138995 ("Copy into fixed size buffer") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_support.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) while ((name = kdb_walk_kallsyms(&pos))) { if (strncmp(name, prefix_name, prefix_len) == 0) { - strcpy(ks_namebuf, name); + strscpy(ks_namebuf, name, sizeof(ks_namebuf)); /* Work out the longest name that matches the prefix */ if (++number == 1) { prev_len = min_t(int, max_len-1, -- cgit v1.2.3 From dbfe67334a1767bcb7be8b50bd237b22b272ef23 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:04 -0700 Subject: tracing: kdb: The skip_lines parameter should have been skip_entries The things skipped by kdb's "ftdump" command when you pass it a parameter have always been entries, not lines. The difference usually doesn't matter but when the trace buffer has multi-line entries (like a stack dump) it can matter. Let's fix this both in the help text for ftdump and also in the local variable names. Link: http://lkml.kernel.org/r/20190319171206.97107-1-dianders@chromium.org Acked-by: Daniel Thompson Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kdb.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 810d78a8d14c..4b666643d69f 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,7 +17,7 @@ #include "trace.h" #include "trace_output.h" -static void ftrace_dump_buf(int skip_lines, long cpu_file) +static void ftrace_dump_buf(int skip_entries, long cpu_file) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; @@ -70,11 +70,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); cnt++; - if (!skip_lines) { + if (!skip_entries) { print_trace_line(&iter); trace_printk_seq(&iter.seq); } else { - skip_lines--; + skip_entries--; } if (KDB_FLAG(CMD_INTERRUPT)) @@ -106,7 +106,7 @@ out: */ static int kdb_ftdump(int argc, const char **argv) { - int skip_lines = 0; + int skip_entries = 0; long cpu_file; char *cp; @@ -114,9 +114,9 @@ if (argc > 2) return KDB_ARGCOUNT; if (argc) { - skip_lines = simple_strtol(argv[1], &cp, 0); + skip_entries = simple_strtol(argv[1], &cp, 0); if (*cp) - skip_lines = 0; + skip_entries = 0; } if (argc == 2) { @@ -129,7 +129,7 @@ } kdb_trap_printk++; - ftrace_dump_buf(skip_lines, cpu_file); + ftrace_dump_buf(skip_entries, cpu_file); kdb_trap_printk--; return 0; @@ -137,7 +137,7 @@ static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } -- cgit v1.2.3 From ecffc8a8c7301f6f3c731ba23e38cd049a046416 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:05 -0700 Subject: tracing: Add trace_total_entries() / trace_total_entries_cpu() These two new exported functions will be used in a future patch by kdb_ftdump() to quickly skip all but
the last few trace entries. Link: http://lkml.kernel.org/r/20190319171206.97107-2-dianders@chromium.org Acked-by: Daniel Thompson Suggested-by: Steven Rostedt Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 65 ++++++++++++++++++++++++++++++++++++++++------------ kernel/trace/trace.h | 3 +++ 2 files changed, 53 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2bc18de7f0dc..dcb9adb44be9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3492,34 +3492,69 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) +{ + unsigned long count; + + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + static void get_total_entries(struct trace_buffer *buf, unsigned long *total, unsigned long *entries) { - unsigned long count; + unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(buf->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(buf->buffer, cpu); - *entries += count; + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; } } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->trace_buffer, &total, &entries); + + return entries; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index da00a3d508c1..33f14b9e78b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -721,6 +721,9 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, -- cgit v1.2.3 From 03197fc02b356606355d7ede343b18e3e3737771 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:06 -0700 Subject: tracing: kdb: Allow ftdump to skip all but the last few entries The 'ftdump' command in kdb is currently a bit of a last resort, at least if you have lots of traces turned on. It's going to print a whole boatload of data out your serial port which is probably running at 115200. 
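The counting helpers introduced just above make the "last N entries" arithmetic trivial for a consumer like this; a rough sketch (the wrapper is hypothetical, with ftrace_dump_buf() standing in for kdb's dumper and error handling omitted):

	/* Hypothetical: dump only the final 'want' entries of one CPU's buffer. */
	static void dump_tail(int cpu, int want)
	{
		unsigned long cnt = trace_total_entries_cpu(NULL, cpu); /* NULL: global trace array */
		int skip = cnt > (unsigned long)want ? (int)(cnt - want) : 0;

		ftrace_dump_buf(skip, cpu);
	}
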
This could easily take many, many minutes. Usually you're most interested in what's at the _end_ of the ftrace buffer, AKA what happened most recently. That means you've got to wait the full time for the dump. The 'ftdump' command does attempt to help you a little bit by allowing you to skip a fixed number of entries. Unfortunately it provides no way for you to know how many entries you should skip. Let's do similar to python and allow you to use a negative number to indicate that you want to skip all entries except the last few. This allows you to quickly see what you want. Note that we also change the printout in ftdump to print the (positive) number of entries actually skipped since that could be helpful to know when you've specified a negative skip count. Link: http://lkml.kernel.org/r/20190319171206.97107-3-dianders@chromium.org Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kdb.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 4b666643d69f..6c1ae6b752d1 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,29 +17,25 @@ #include "trace.h" #include "trace_output.h" +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + static void ftrace_dump_buf(int skip_entries, long cpu_file) { - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; - trace_init_global_iter(&iter); - iter.buffer_iter = buffer_iter; tr = iter.tr; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); + if (skip_entries) + kdb_printf("(skipping %d entries)\n", skip_entries); /* reset all but tr, trace, and overruns */ memset(&iter.seq, 0, @@ -89,10 +85,6 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) out: tr->trace_flags = old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - for_each_tracing_cpu(cpu) { if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); @@ -109,6 +101,8 @@ static int kdb_ftdump(int argc, const char **argv) int skip_entries = 0; long cpu_file; char *cp; + int cnt; + int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -129,7 +123,29 @@ static int kdb_ftdump(int argc, const char **argv) } kdb_trap_printk++; + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + /* A negative skip_entries means skip all but the last entries */ + if (skip_entries < 0) { + if (cpu_file == RING_BUFFER_ALL_CPUS) + cnt = trace_total_entries(NULL); + else + cnt = trace_total_entries_cpu(NULL, cpu_file); + skip_entries = max(cnt + skip_entries, 0); + } + ftrace_dump_buf(skip_entries, cpu_file); + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + kdb_trap_printk--; return 0; @@ -138,7 +154,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { kdb_register_flags("ftdump", 
kdb_ftdump, "[skip_#entries] [cpu]", - "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); + "Dump ftrace log; -skip dumps last #entries", 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } -- cgit v1.2.3 From 77a5352ba977d2554643e3797e10823d0d03dcf7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:44 +1000 Subject: sched/core: Allow the remote scheduler tick to be started on CPU0 This has no effect yet because CPU0 will always be a housekeeping CPU until a later change. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-2-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de8ab411826c..cef22c5499a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5866,7 +5866,7 @@ void __init sched_init_smp(void) static int __init migration_init(void) { - sched_rq_cpu_starting(smp_processor_id()); + sched_cpu_starting(smp_processor_id()); return 0; } early_initcall(migration_init); -- cgit v1.2.3 From aaebdf8d68479f78d9f72b239684f70fbb0722c6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 1 May 2019 14:58:18 +0100 Subject: genirq/msi: Add a new field in msi_desc to store an IOMMU cookie When an MSI doorbell is located downstream of an IOMMU, it is required to swizzle the physical address with an appropriately-mapped IOVA for any device attached to one of our DMA ops domain. At the moment, the allocation of the mapping may be done when composing the message. However, the composing may be done in non-preemptible context while the allocation must be called from preemptible context. A follow-up change will split the current logic in two functions requiring to keep an IOMMU cookie per MSI. A new field is introduced in msi_desc to store an IOMMU cookie. As the cookie may not be required in some configuration, the field is protected under a new config CONFIG_IRQ_MSI_IOMMU. A pair of helpers has also been introduced to access the field. Signed-off-by: Julien Grall Reviewed-by: Robin Murphy Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- kernel/irq/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..8fee06625c37 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -91,6 +91,9 @@ config GENERIC_MSI_IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ +config IRQ_MSI_IOMMU + bool + config HANDLE_DOMAIN_IRQ bool -- cgit v1.2.3 From 2f1a6fbbef7781382850c3104ecb658f21b5d460 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:45 +1000 Subject: power/suspend: Add function to disable secondaries for suspend This adds a function to disable secondary CPUs for suspend that are not necessarily non-zero / non-boot CPUs. Platforms will be able to use this to suspend using non-zero CPUs. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J .
Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-3-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/kexec_core.c | 4 ++-- kernel/power/hibernate.c | 12 ++++++------ kernel/power/suspend.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..fd5c95ff9251 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1150,7 +1150,7 @@ int kernel_kexec(void) error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); @@ -1183,7 +1183,7 @@ int kernel_kexec(void) Enable_irqs: local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..cfc7a57049e4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -281,7 +281,7 @@ static int create_image(int platform_mode) if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; @@ -323,7 +323,7 @@ static int create_image(int platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: platform_finish(platform_mode); @@ -417,7 +417,7 @@ int hibernation_snapshot(int platform_mode) int __weak hibernate_resume_nonboot_cpu_disable(void) { - return disable_nonboot_cpus(); + return suspend_disable_secondary_cpus(); } /** @@ -486,7 +486,7 @@ static int resume_target_kernel(bool platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); @@ -564,7 +564,7 @@ int hibernation_platform_enter(void) if (error) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; @@ -586,7 +586,7 @@ int hibernation_platform_enter(void) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..59b6def23046 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -428,7 +428,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -458,7 +458,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) BUG_ON(irqs_disabled()); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_wake: platform_resume_noirq(state); -- cgit v1.2.3 From 9ca12ac04bb7d7cfb28aa549dcd3d15761f15543 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:46 +1000 Subject: kernel/cpu: Allow non-zero CPU to be primary for suspend / kexec freeze This patch provides an arch option, ARCH_SUSPEND_NONZERO_CPU, to opt-in to allowing suspend to occur on one of the housekeeping CPUs rather than hardcoded CPU0. This will allow CPU0 to be a nohz_full CPU with a later change. 
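Only the kernel/ side of the series appears in this log; the header-side helper that consumes the new option plausibly looks like the following sketch (an assumption, since the include/ change is filtered out of this view):

	/* Assumed shape of the include/linux/cpu.h helper (not shown in these diffs). */
	static inline int suspend_disable_secondary_cpus(void)
	{
		int cpu = 0;

		if (IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU))
			cpu = -1;	/* let freeze_secondary_cpus() pick a housekeeping CPU */

		return freeze_secondary_cpus(cpu);
	}
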
It may be possible for platforms with hardware/firmware restrictions on suspend/wake to effectively support this by handing off the final stage to CPU0 when kernel housekeeping is no longer required. Another option is to make housekeeping / nohz_full mask dynamic at runtime, but the complexity could not be justified at this time. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-4-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 10 +++++++++- kernel/power/Kconfig | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6754f3ecfd94..d1bf6e2b4752 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1199,8 +1200,15 @@ int freeze_secondary_cpus(int primary) int cpu, error = 0; cpu_maps_update_begin(); - if (!cpu_online(primary)) + if (primary == -1) { primary = cpumask_first(cpu_online_mask); + if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) + primary = housekeeping_any_cpu(HK_FLAG_TIMER); + } else { + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); + } + /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..9bbaaab14b36 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -114,6 +114,15 @@ config PM_SLEEP_SMP depends on PM_SLEEP select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU + def_bool y + depends on PM_SLEEP_SMP + depends on ARCH_SUSPEND_NONZERO_CPU + ---help--- + If an arch can suspend (for suspend, hibernate, kexec, etc) on a + non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This + will allow nohz_full mask to include CPU0. + config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP -- cgit v1.2.3 From 9219565aa89033a9cfdae788c1940473a1253d6c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:47 +1000 Subject: sched/isolation: Require a present CPU in housekeeping mask During housekeeping mask setup, currently a possible CPU is required. That does not guarantee the CPU would be available at boot time, so check to ensure that at least one present CPU is in the mask. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J .
Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-5-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b02d148e7672..687302051a27 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -65,6 +65,7 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + cpumask_var_t tmp; int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) return 0; } + alloc_bootmem_cpumask_var(&tmp); if (!housekeeping_flags) { alloc_bootmem_cpumask_var(&housekeeping_mask); cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - if (cpumask_empty(housekeeping_mask)) + + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) { + pr_warn("Housekeeping: must include one present CPU, " + "using boot CPU:%d\n", smp_processor_id()); __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + } } else { - cpumask_var_t tmp; - - alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); if (!cpumask_equal(tmp, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - free_bootmem_cpumask_var(tmp); } + free_bootmem_cpumask_var(tmp); if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { -- cgit v1.2.3 From 08ae95f4fd3b38b257f5dc7e6507e071c27ba0d5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:48 +1000 Subject: nohz_full: Allow the boot CPU to be nohz_full Allow the boot CPU/CPU0 to be nohz_full. Have the boot CPU take the do_timer duty during boot until a housekeeping CPU can take over. This is supported when CONFIG_PM_SLEEP_SMP is not configured, or when it is configured and the arch allows suspend on non-zero CPUs. nohz_full has been trialed at a large supercomputer site and found to significantly reduce jitter. In order to deploy it in production, they need CPU0 to be nohz_full because their job control system requires the application CPUs to start from 0, and the housekeeping CPUs are placed higher. An equivalent job scheduling that uses CPU0 for housekeeping could be achieved by modifying their system, but it is preferable if nohz_full can support their environment without modification. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . 
Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-6-npiggin@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 50 +++++++++++++++++++++++++++++++++++++++++++---- kernel/time/tick-sched.c | 34 ++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df401463a191..e49e8091f9ac 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -46,6 +46,14 @@ ktime_t tick_period; * procedure also covers cpu hotplug. */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. + */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif /* * Debugging: see timer_list.c @@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ + int cpu = *(unsigned int *)info; + + WARN_ON(tick_do_timer_cpu != smp_processor_id()); + + tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ + int cpu = smp_processor_id(); + int from = tick_do_timer_boot_cpu; + + if (from >= 0 && from != cpu) + smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif + /* * Setup the tick device */ @@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - else - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); tick_period = NSEC_PER_SEC / HZ; +#ifdef CONFIG_NO_HZ_FULL + /* + * The boot CPU may be nohz_full, in which case set + * tick_do_timer_boot_cpu so the first housekeeping + * secondary that comes up will take do_timer from + * us. + */ + if (tick_nohz_full_cpu(cpu)) + tick_do_timer_boot_cpu = cpu; + + } else if (tick_do_timer_boot_cpu != -1 && + !tick_nohz_full_cpu(cpu)) { + tick_take_do_timer_from_boot(); + tick_do_timer_boot_cpu = -1; + WARN_ON(tick_do_timer_cpu != cpu); +#endif } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..4aa917acbe1c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. + * + * If nohz_full is enabled, this should not happen because the + * tick_do_timer_cpu never relinquishes. */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) - && !tick_nohz_full_cpu(cpu)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL + WARN_ON(tick_nohz_full_running); +#endif tick_do_timer_cpu = cpu; + } #endif /* Check, if the jiffies need an update */ @@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) static int tick_nohz_cpu_down(unsigned int cpu) { /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks + * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. 
It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) @@ -423,12 +429,15 @@ void __init tick_nohz_init(void) return; } - cpu = smp_processor_id(); + if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && + !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { + cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", - cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warn("NO_HZ: Clearing %d from nohz_full range " + "for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } } for_each_cpu(cpu, tick_nohz_full_mask) @@ -904,8 +913,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) /* * Boot safety: make sure the timekeeping duty has been * assigned before entering dyntick-idle mode, + * tick_do_timer_cpu is TICK_DO_TIMER_BOOT */ - if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) + return false; + + /* Should not happen for nohz-full */ + if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } -- cgit v1.2.3 From 4d141ab3416d90f87775f5dee725efdf40110a8f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 3 May 2019 15:26:24 +0200 Subject: livepatch: Remove custom kobject state handling kobject_init() always succeeds and sets the reference count to 1. It allows to always free the structures via kobject_put() and the related release callback. Note that the custom kobject state handling was used only because we did not know that kobject_put() can and actually should get called even when kobject_init_and_add() fails. The patch should not change the existing behavior. Suggested-by: "Tobin C. Harding" Signed-off-by: Petr Mladek Reviewed-by: Kamalesh Babulal Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 56 +++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 14f33ab6c583..42385f23252a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,6 +426,9 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } +static struct kobj_type klp_ktype_object; +static struct kobj_type klp_ktype_func; + static struct klp_object *klp_alloc_object_dynamic(const char *name) { struct klp_object *obj; @@ -443,6 +446,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); obj->dynamic = true; return obj; @@ -471,6 +475,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } + kobject_init(&func->kobj, &klp_ktype_func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -588,13 +593,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only) continue; list_del(&func->node); - - /* Might be called from klp_init_patch() error path. */ - if (func->kobj_added) { - kobject_put(&func->kobj); - } else if (func->nop) { - klp_free_func_nop(func); - } + kobject_put(&func->kobj); } } @@ -624,13 +623,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only) continue; list_del(&obj->node); - - /* Might be called from klp_init_patch() error path. 
*/ - if (obj->kobj_added) { - kobject_put(&obj->kobj); - } else if (obj->dynamic) { - klp_free_object_dynamic(obj); - } + kobject_put(&obj->kobj); } } @@ -675,10 +668,8 @@ static void klp_free_patch_finish(struct klp_patch *patch) * this is called when the patch gets disabled and it * cannot get enabled again. */ - if (patch->kobj_added) { - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); - } + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); /* Put the module after the last access to struct klp_patch. */ if (!patch->forced) @@ -700,8 +691,6 @@ static void klp_free_patch_work_fn(struct work_struct *work) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - int ret; - if (!func->old_name) return -EINVAL; @@ -724,13 +713,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); - if (!ret) - func->kobj_added = true; - - return ret; + return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", + func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -801,11 +786,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? obj->name : "vmlinux"; - ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, - &patch->kobj, "%s", name); + ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); if (ret) return ret; - obj->kobj_added = true; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); @@ -829,7 +812,7 @@ static int klp_init_patch_early(struct klp_patch *patch) INIT_LIST_HEAD(&patch->list); INIT_LIST_HEAD(&patch->obj_list); - patch->kobj_added = false; + kobject_init(&patch->kobj, &klp_ktype_patch); patch->enabled = false; patch->forced = false; INIT_WORK(&patch->free_work, klp_free_patch_work_fn); @@ -840,11 +823,11 @@ static int klp_init_patch_early(struct klp_patch *patch) return -EINVAL; INIT_LIST_HEAD(&obj->func_list); - obj->kobj_added = false; + kobject_init(&obj->kobj, &klp_ktype_object); list_add_tail(&obj->node, &patch->obj_list); klp_for_each_func_static(obj, func) { - func->kobj_added = false; + kobject_init(&func->kobj, &klp_ktype_func); list_add_tail(&func->node, &obj->func_list); } } @@ -860,11 +843,9 @@ static int klp_init_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, "%s", patch->mod->name); + ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name); if (ret) return ret; - patch->kobj_added = true; if (patch->replace) { ret = klp_add_nops(patch); @@ -926,9 +907,6 @@ static int __klp_enable_patch(struct klp_patch *patch) if (WARN_ON(patch->enabled)) return -EINVAL; - if (!patch->kobj_added) - return -EINVAL; - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_init_transition(patch, KLP_PATCHED); -- cgit v1.2.3 From f68d67cf2f83dc82675969724b59ca7c6da43fa9 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 3 May 2019 15:26:25 +0200 Subject: livepatch: Remove duplicated code for early initialization kobject_init() call added one more operation that has to be done when doing the early initialization of both static and dynamic livepatch structures. 
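For context, the init-then-add-then-put pattern these two cleanups rely on, as a minimal sketch (all names hypothetical):

	static void demo_release(struct kobject *kobj)
	{
		kfree(container_of(kobj, struct demo, kobj));
	}

	static struct kobj_type demo_ktype = {
		.release = demo_release,
	};

	kobject_init(&d->kobj, &demo_ktype);		/* refcount is now 1 */
	err = kobject_add(&d->kobj, parent, "%s", name);
	if (err)
		kobject_put(&d->kobj);			/* legal even after a failed add; release frees d */
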
It would have been easier when the early initialization code was not duplicated. Let's deduplicate it for future generations of livepatching hackers. The patch does not change the existing behavior. Signed-off-by: Petr Mladek Reviewed-by: Kamalesh Babulal Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 42385f23252a..f12c0eabd843 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,10 +426,13 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } -static struct kobj_type klp_ktype_object; -static struct kobj_type klp_ktype_func; +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj); -static struct klp_object *klp_alloc_object_dynamic(const char *name) +static struct klp_object *klp_alloc_object_dynamic(const char *name, + struct klp_patch *patch) { struct klp_object *obj; @@ -445,8 +448,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } } - INIT_LIST_HEAD(&obj->func_list); - kobject_init(&obj->kobj, &klp_ktype_object); + klp_init_object_early(patch, obj); obj->dynamic = true; return obj; @@ -475,7 +477,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } - kobject_init(&func->kobj, &klp_ktype_func); + klp_init_func_early(obj, func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -495,11 +497,9 @@ static int klp_add_object_nops(struct klp_patch *patch, obj = klp_find_object(patch, old_obj); if (!obj) { - obj = klp_alloc_object_dynamic(old_obj->name); + obj = klp_alloc_object_dynamic(old_obj->name, patch); if (!obj) return -ENOMEM; - - list_add_tail(&obj->node, &patch->obj_list); } klp_for_each_func(old_obj, old_func) { @@ -510,8 +510,6 @@ static int klp_add_object_nops(struct klp_patch *patch, func = klp_alloc_func_nop(old_func, obj); if (!func) return -ENOMEM; - - list_add_tail(&func->node, &obj->func_list); } return 0; @@ -802,6 +800,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return ret; } +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func) +{ + kobject_init(&func->kobj, &klp_ktype_func); + list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj) +{ + INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); + list_add_tail(&obj->node, &patch->obj_list); +} + static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; @@ -822,13 +835,10 @@ static int klp_init_patch_early(struct klp_patch *patch) if (!obj->funcs) return -EINVAL; - INIT_LIST_HEAD(&obj->func_list); - kobject_init(&obj->kobj, &klp_ktype_object); - list_add_tail(&obj->node, &patch->obj_list); + klp_init_object_early(patch, obj); klp_for_each_func_static(obj, func) { - kobject_init(&func->kobj, &klp_ktype_func); - list_add_tail(&func->node, &obj->func_list); + klp_init_func_early(obj, func); } } -- cgit v1.2.3 From 13bf5ced93775ffccb53527a9d862e023a9daa03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Mar 2019 15:44:06 +0100 Subject: dma-mapping: add a Kconfig symbol to indicate arch_dma_prep_coherent presence Add a 
Kconfig symbol that indicates an architecture provides an arch_dma_prep_coherent implementation, and provide a stub otherwise. This will allow the generic dma-iommu code to use it while still allowing it to be built for cache coherent architectures. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 52b704e2b97a..83d711f8d665 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -38,6 +38,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool +config ARCH_HAS_DMA_PREP_COHERENT + bool + config ARCH_HAS_DMA_COHERENT_TO_PFN bool -- cgit v1.2.3 From 533307dc20a9e84a0687d4ca24aeb669516c0243 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 30 Apr 2019 17:57:29 +0800 Subject: cgroup: Remove unused cgrp variable The 'cgrp' is set but not used in commit <76f969e8948d8> ("cgroup: cgroup v2 freezer"). Remove it to avoid [-Wunused-but-set-variable] warning. Cc: Tejun Heo Signed-off-by: Shaokun Zhang Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57edcf398d71..4fe9f7f1a3fa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5864,11 +5864,8 @@ void cgroup_post_fork(struct task_struct *child) * the task into the frozen state. */ if (unlikely(cgroup_task_freeze(child))) { - struct cgroup *cgrp; - spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); - cgrp = cset->dfl_cgrp; child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); -- cgit v1.2.3 From cb2c4cd87874a7975b7b8615866b3a87bae10aab Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Apr 2019 10:59:44 -0700 Subject: cgroup: prevent spurious transition into non-frozen state If freezing of a cgroup races with waking of a task from the frozen state (like waiting in vfork() or in do_signal_stop()), a spurious transition of the cgroup state can happen. The task enters cgroup_leave_frozen(true), the cgroup->nr_frozen_tasks counter decrements, and the cgroup is switched to the unfrozen state. To prevent it, let's reserve cgroup_leave_frozen(true) for terminating processes and use cgroup_leave_frozen(false) otherwise. To avoid busy-looping in the signal handling loop waiting for JOBCTL_TRAP_FREEZE to be set from the cgroup freezing path, let's do it explicitly in cgroup_leave_frozen(), if the task is going to stay frozen. Suggested-by: Oleg Nesterov Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/freezer.c | 16 +++++----------- kernel/signal.c | 2 +- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3bfbb3c8baf3..c321e768f8d3 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -139,19 +139,13 @@ void cgroup_leave_frozen(bool always_leave) cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(&current->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(&current->sighand->siglock); } spin_unlock_irq(&css_set_lock); - - if (unlikely(current->frozen)) { - /* - * If the task remained in the frozen state, - * make sure it won't reach userspace without - * entering the signal handling loop.
- */ - spin_lock_irq(&current->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - } } /* diff --git a/kernel/signal.c b/kernel/signal.c index 095e0fc57b25..16b72f4f14df 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2514,7 +2514,7 @@ relock: */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); - cgroup_leave_frozen(true); + cgroup_leave_frozen(false); goto relock; } -- cgit v1.2.3 From 96b9c592def5d7203bdad1337d9c92a2183de5cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Apr 2019 10:59:45 -0700 Subject: cgroup: get rid of cgroup_freezer_frozen_exit() A task should never enter the exit path with the task->frozen bit set. Any frozen task must enter the signal handling loop and the only way to escape is through cgroup_leave_frozen(true), which unconditionally drops the task->frozen bit. So it means that cgroup_freezer_frozen_exit() has zero chances to be called and has to be removed. Let's put a WARN_ON_ONCE() instead of the cgroup_freezer_frozen_exit() call to catch any potential leak of the task's frozen bit. Suggested-by: Oleg Nesterov Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 5 ++--- kernel/cgroup/freezer.c | 10 ---------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4fe9f7f1a3fa..327f37c9fdfa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5926,9 +5926,8 @@ void cgroup_exit(struct task_struct *tsk) css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; - if (unlikely(cgroup_task_frozen(tsk))) - cgroup_freezer_frozen_exit(tsk); - else if (unlikely(cgroup_task_freeze(tsk))) + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index c321e768f8d3..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -248,16 +248,6 @@ void cgroup_freezer_migrate_task(struct task_struct *task, cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } -void cgroup_freezer_frozen_exit(struct task_struct *task) -{ - struct cgroup *cgrp = task_dfl_cgroup(task); - - lockdep_assert_held(&css_set_lock); - - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); -} - void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; -- cgit v1.2.3 From 1900da520c9fdc3a7cf00d21638f7c8721d5ac7f Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Sun, 21 Apr 2019 19:47:27 +0800 Subject: kernel: cgroup: fix misuse of %x Pointers should be printed with %p or %px rather than cast to unsigned long type and printed with %lx. Change %lx to %p to print the pointers.
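For illustration, a sketch of the difference (keeping in mind that on recent kernels plain %p prints a hashed value, while %px prints the raw address and is meant for debugging only):

	struct cgroup_subsys_state *css = get_css();	/* hypothetical helper */

	pr_info("css=%lx\n", (unsigned long)css);	/* avoid: the cast defeats printk format checking */
	pr_info("css=%p\n", css);			/* preferred: printed (hashed) as a pointer */
	pr_info("css=%px\n", css);			/* raw address, debug-only */
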
Signed-off-by: Fuqian Huang Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } -- cgit v1.2.3 From a9e9bcb45b1525ba7aea26ed9441e8632aeeda58 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 28 Apr 2019 17:25:38 -0400 Subject: locking/rwsem: Prevent decrement of reader count before increment During my rwsem testing, it was found that after a down_read(), the reader count may occasionally become 0 or even negative. Consequently, a writer may steal the lock at that time and execute with the reader in parallel thus breaking the mutual exclusion guarantee of the write lock. In other words, both readers and writer can become rwsem owners simultaneously. The current reader wakeup code does it in one pass to clear waiter->task and put them into wake_q before fully incrementing the reader count. Once waiter->task is cleared, the corresponding reader may see it, finish the critical section and do unlock to decrement the count before the count is incremented. This is not a problem if there is only one reader to wake up as the count has been pre-incremented by 1. It is a problem if there are more than one readers to be woken up and writer can steal the lock. The wakeup was actually done in 2 passes before the following v4.9 commit: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once") To fix this problem, the wakeup is now done in two passes again. In the first pass, we collect the readers and count them. The reader count is then fully incremented. In the second pass, the waiter->task is then cleared and they are put into wake_q to be woken up later. 
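In outline, the wakeup path becomes the following (a simplified sketch of the shape of the fix, not the exact code in the diff below):

	/* pass 1: count the contiguous readers at the head of the queue */
	list_for_each_entry(waiter, &sem->wait_list, list) {
		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
			break;
		woken++;
	}
	list_cut_before(&wlist, &sem->wait_list, &waiter->list);

	/* fully publish the reader count before any waiter can run */
	atomic_long_add(woken * RWSEM_ACTIVE_READ_BIAS - adjustment, &sem->count);

	/* pass 2: only now clear waiter->task and queue the wakeups */
	list_for_each_entry_safe(waiter, tmp, &wlist, list)
		queue_reader_wakeup(waiter);	/* hypothetical stand-in for the wake_q handoff */
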
Signed-off-by: Waiman Long Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Fixes: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once") Link: http://lkml.kernel.org/r/20190428212557.13482-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 46 ++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 6b3ee9948bf1..0b1f77957240 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, { struct rwsem_waiter *waiter, *tmp; long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; /* * Take a peek at the queue head waiter such that we can determine @@ -188,18 +189,43 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * of the queue. We know that woken will be at least 1 as we accounted * for above. Note we increment the 'active part' of the count by the * number of readers before waking any processes up. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. */ - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { - struct task_struct *tsk; - + list_for_each_entry(waiter, &sem->wait_list, list) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; woken++; - tsk = waiter->task; + } + list_cut_before(&wlist, &sem->wait_list, &waiter->list); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + tsk = waiter->task; get_task_struct(tsk); - list_del(&waiter->list); + /* * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -213,16 +239,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add_safe(wake_q, tsk); } - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); } /* -- cgit v1.2.3 From b3e5838252665ee4cfa76b82bdf1198dca81e5be Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 27 Mar 2019 13:04:15 +0100 Subject: clone: add CLONE_PIDFD This patchset makes it possible to retrieve pid file descriptors at process creation time by introducing the new flag CLONE_PIDFD to the clone() system call. Linus originally suggested to implement this as a new flag to clone() instead of making it a separate system call. 
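A minimal user-space sketch of the resulting interface (assuming the x86_64 raw clone() argument order; error handling omitted):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef CLONE_PIDFD
	#define CLONE_PIDFD 0x00001000	/* as introduced by this patch */
	#endif

	int main(void)
	{
		int pidfd = -1;
		/* x86_64 raw clone: (flags, stack, parent_tid, child_tid, tls);
		 * with CLONE_PIDFD the new fd comes back through parent_tid. */
		long pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, 0, &pidfd, 0, 0);

		if (pid == 0)
			_exit(0);	/* child */
		printf("child %ld, pidfd %d\n", pid, pidfd);
		return 0;
	}
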
As spotted by Linus, there is exactly one bit for clone() left. CLONE_PIDFD creates file descriptors based on the anonymous inode implementation in the kernel that will also be used to implement the new mount api. They serve as a simple opaque handle on pids. Logically, this makes it possible to interpret a pidfd differently, narrowing or widening the scope of various operations (e.g. signal sending). Thus, a pidfd cannot just refer to a tgid, but also a tid, or in theory - given appropriate flag arguments in relevant syscalls - a process group or session. A pidfd does not represent a privilege. This does not imply it cannot ever be that way but for now this is not the case. A pidfd comes with additional information in fdinfo if the kernel supports procfs. The fdinfo file contains the pid of the process in the callers pid namespace in the same format as the procfs status file, i.e. "Pid:\t%d". As suggested by Oleg, with CLONE_PIDFD the pidfd is returned in the parent_tidptr argument of clone. This has the advantage that we can give back the associated pid and the pidfd at the same time. To remove worries about missing metadata access this patchset comes with a sample program that illustrates how a combination of CLONE_PIDFD, and pidfd_send_signal() can be used to gain race-free access to process metadata through /proc/. The sample program can easily be translated into a helper that would be suitable for inclusion in libc so that users don't have to worry about writing it themselves. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner Co-developed-by: Jann Horn Signed-off-by: Jann Horn Reviewed-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Thomas Gleixner Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro --- kernel/fork.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..e45f0acaf451 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1662,6 +1664,58 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +const struct file_operations pidfd_fops = { + .release = pidfd_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. 
+ */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1674,13 +1728,14 @@ static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; @@ -1730,6 +1785,31 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -1936,6 +2016,22 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). + */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1996,7 +2092,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2111,6 +2207,9 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + ksys_close(pidfd); bad_fork_free_pid: cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) @@ -2176,7 +2275,7 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task); @@ -2223,7 +2322,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); -- cgit v1.2.3 From 2151ad1b067275730de1b38c7257478cae47d29e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Apr 2019 22:50:25 +0200 Subject: signal: support CLONE_PIDFD with pidfd_send_signal Let pidfd_send_signal() use pidfds retrieved via CLONE_PIDFD. 
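Combined with the clone() sketch earlier, user space can then signal the child race-free, roughly as follows (the syscall number is an assumption for x86_64; check your headers):

	#include <signal.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_pidfd_send_signal
	#define __NR_pidfd_send_signal 424	/* assumed x86_64 number */
	#endif

	/* pidfd as obtained from clone(CLONE_PIDFD, ...) */
	static int terminate_child(int pidfd)
	{
		if (syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0) {
			perror("pidfd_send_signal");
			return -1;
		}
		return 0;
	}
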
With this patch pidfd_send_signal() becomes independent of procfs. This fulfils the request made when we merged the pidfd_send_signal() patchset. The pidfd_send_signal() syscall is now always available allowing for it to be used by users without procfs mounted or even users without procfs support compiled into the kernel. Signed-off-by: Christian Brauner Co-developed-by: Jann Horn Signed-off-by: Jann Horn Acked-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Thomas Gleixner Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro --- kernel/signal.c | 12 +++++++++--- kernel/sys_ni.c | 3 --- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..1581140f2d99 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -#ifdef CONFIG_PROC_FS /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid @@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) return copy_siginfo_from_user(kinfo, info); } +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + /** * sys_pidfd_send_signal - send a signal to a process through a task file * descriptor @@ -3586,7 +3593,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, return -EBADF; /* Is this a pidfd? */ - pid = tgid_pidfd_to_pid(f.file); + pid = pidfd_to_pid(f.file); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; @@ -3620,7 +3627,6 @@ err: fdput(f); return ret; } -#endif /* CONFIG_PROC_FS */ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d21f4befaea4..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -167,9 +167,6 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ -/* kernel/signal.c */ -COND_SYSCALL(pidfd_send_signal); - /* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); -- cgit v1.2.3 From 4dd537aca25dd2e0e8aca8b8923930cbe6240003 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:55:41 +0900 Subject: tracing: uprobes: Re-enable $comm support for uprobe events Since commit 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") dropped the $comm support from uprobe events, this re-enables it. For $comm support, this uses strlcpy() instead of strncpy_from_user() to copy the current task's comm. Because the comm is in kernel space, strncpy_from_user() always fails to copy it. This also uses strlen() instead of strnlen_user() to measure the length of the comm. Note that this uses -ECOMM as a token value to fetch the comm string. If the user-space pointer equals -ECOMM, it will be translated to task->comm.
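As a usage sketch, a uprobe event recording the current comm could then be registered from user space like this (the tracefs path, target binary, and offset are all hypothetical):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static void add_comm_uprobe(void)
	{
		/* hypothetical probe target at offset 0x4245c0 in /bin/bash */
		static const char cmd[] = "p:u_comm /bin/bash:0x4245c0 $comm:string\n";
		int fd = open("/sys/kernel/debug/tracing/uprobe_events",
			      O_WRONLY | O_APPEND);

		if (fd >= 0) {
			write(fd, cmd, strlen(cmd));
			close(fd);
		}
	}
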
Link: http://lkml.kernel.org/r/155723734162.9149.4042756162201097965.stgit@devnote2 Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Reported-by: Andreas Ziegler Acked-by: Andreas Ziegler Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b7737666c1a8..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -124,6 +124,7 @@ struct fetch_insn { /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ #define FETCH_INSN_MAX 16 +#define FETCH_TOKEN_COMM (-ECOMM) /* Fetch type information table */ struct fetch_type { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cd8750a72768..eb7e06b54741 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (unlikely(!maxlen)) return -ENOMEM; - ret = strncpy_from_user(dst, src, maxlen); + if (addr == FETCH_TOKEN_COMM) + ret = strlcpy(dst, current->comm, maxlen); + else + ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr) int len; void __user *vaddr = (void __force __user *) addr; - len = strnlen_user(vaddr, MAX_STRING_SIZE); + if (addr == FETCH_TOKEN_COMM) + len = strlen(current->comm) + 1; + else + len = strnlen_user(vaddr, MAX_STRING_SIZE); return (len > MAX_STRING_SIZE) ? 0 : len; } @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_IMM: val = code->immediate; break; + case FETCH_OP_COMM: + val = FETCH_TOKEN_COMM; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; -- cgit v1.2.3 From 489fe0096b19b664b8f3bed0fd604d617a229b5a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:55:52 +0900 Subject: tracing: probeevent: Do not accumulate on ret variable Do not accumulate the strlen result in the "ret" local variable, because it is already accumulated in the "total" local variable for the array case. Link: http://lkml.kernel.org/r/155723735237.9149.3192150444705457531.stgit@devnote2 Fixes: 40b53b771806 ("tracing: probeevent: Add array type support") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4737bb8c07a3..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -88,7 +88,7 @@ stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { if (code->op == FETCH_OP_ST_STRING) { - ret += fetch_store_strlen(val + code->offset); + ret = fetch_store_strlen(val + code->offset); code++; goto array; } else -- cgit v1.2.3 From 3dd1f7f24f8ceec00bbbc364c2ac3c893f0fdc4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:56:02 +0900 Subject: tracing: probeevent: Fix to make the type of $comm string Fix to make the type of $comm "string". If we set another type on the $comm argument, it shows a meaningless value or wrong data. Currently probe events allow us to set string array type (e.g.
":string[2]"), or other digit types like x8 on $comm. But since clearly $comm is just a string data, it should not be fetched by other types including array. Link: http://lkml.kernel.org/r/155723736241.9149.14582064184468574539.stgit@devnote2 Cc: Andreas Ziegler Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4cc2d467d34c..e0d1d5353464 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -533,13 +533,14 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } } - /* - * The default type of $comm should be "string", and it can't be - * dereferenced. - */ - if (!t && strcmp(arg, "$comm") == 0) + + /* Since $comm can not be dereferred, we can find $comm by strcmp */ + if (strcmp(arg, "$comm") == 0) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + return -EINVAL; parg->type = find_fetch_type("string"); - else + } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); -- cgit v1.2.3 From 5c173bedb24dfcc8e412d3c3f111c504e4408dd5 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 1 Nov 2018 11:46:40 -0400 Subject: ring-buffer: Fix mispelling of Calculate It's not "Caculate". Link: http://lkml.kernel.org/r/20181101154640.23162-1-tiny.windzz@gmail.com Signed-off-by: Yangtao Li Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -362,7 +362,7 @@ static void ring_buffer_producer(void) hit--; /* make it non zero */ } - /* Caculate the average time in nanosecs */ + /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); trace_printk("%ld ns per entry\n", avg); } -- cgit v1.2.3 From 0f5e5a3ab7fa1c09370a4d709ad6157457d5b8b6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Mar 2019 09:17:57 +0100 Subject: tracing: Eliminate const char[] auto variables Automatic const char[] variables cause unnecessary code generation. For example, the this_mod variable leads to 3f04: 48 b8 5f 5f 74 68 69 73 5f 6d movabs $0x6d5f736968745f5f,%rax # __this_m 3f0e: 4c 8d 44 24 02 lea 0x2(%rsp),%r8 3f13: 48 8d 7c 24 10 lea 0x10(%rsp),%rdi 3f18: 48 89 44 24 02 mov %rax,0x2(%rsp) 3f1d: 4c 89 e9 mov %r13,%rcx 3f20: b8 65 00 00 00 mov $0x65,%eax # e 3f25: 48 c7 c2 00 00 00 00 mov $0x0,%rdx 3f28: R_X86_64_32S .rodata.str1.1+0x18d 3f2c: be 48 00 00 00 mov $0x48,%esi 3f31: c7 44 24 0a 6f 64 75 6c movl $0x6c75646f,0xa(%rsp) # odul 3f39: 66 89 44 24 0e mov %ax,0xe(%rsp) i.e., the string gets built on the stack at runtime. Similar code can be found for the other instances I'm replacing here. Putting the string in .rodata reduces the combined .text+.rodata size and saves time and stack space at runtime. The simplest fix, and what I've done for the this_mod case, is to just make the variable static. 
However, for the "" case where the same string is used twice, that prevents the linker from merging those two literals, so instead use a macro - that also keeps the two instances automatically in sync (instead of only the compile-time strlen expression). Finally, for the two runs of spaces, it turns out that the "build these strings on the stack" is not the worst part of what gcc does - it turns print_func_help_header_irq() into "if (tgid) { /* print_event_info + five seq_printf calls */ } else { /* print event_info + another five seq_printf */}". Taking inspiration from a suggestion from Al Viro, use %.*s to make snprintf either stop after the first two spaces or print the whole string. As a bonus, the seq_printfs now fit on single lines (at least, they are not longer than the existing ones in the function just above), making it easier to see that the ascii art lines up. x86-64 defconfig + CONFIG_FUNCTION_TRACER: $ scripts/stackdelta /tmp/stackusage.{0,1} ./kernel/trace/ftrace.c ftrace_mod_callback 152 136 -16 ./kernel/trace/trace.c trace_default_header 56 32 -24 ./kernel/trace/trace.c tracing_mark_raw_write 96 72 -24 ./kernel/trace/trace.c tracing_mark_write 104 80 -24 bloat-o-meter add/remove: 1/0 grow/shrink: 0/4 up/down: 14/-375 (-361) Function old new delta this_mod - 14 +14 ftrace_mod_callback 577 542 -35 tracing_mark_raw_write 444 374 -70 tracing_mark_write 616 540 -76 trace_default_header 600 406 -194 Link: http://lkml.kernel.org/r/20190320081757.6037-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 34 +++++++++++++--------------------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 433a64f49532..7765a53f1006 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3875,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ - const char this_mod[] = "__this_module"; + static const char this_mod[] = "__this_module"; char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcb9adb44be9..3259019cc66d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3593,25 +3593,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char tgid_space[] = " "; - const char space[] = " "; + const char *space = " "; + int prec = tgid ? 10 : 2; print_event_info(buf, m); - seq_printf(m, "# %s _-----=> irqs-off\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s / _----=> need-resched\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s||| / delay\n", - tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", - tgid ? " TGID " : space); - seq_printf(m, "# | | %s | |||| | |\n", - tgid ? 
" | " : space); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -6342,13 +6335,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - const char faulted[] = ""; ssize_t written; int size; int len; /* Used in tracing_mark_raw_write() as well */ -#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ +#define FAULTED_STR "" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -6380,7 +6373,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else @@ -6421,7 +6414,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; - const char faulted[] = ""; unsigned long irq_flags; ssize_t written; int size; @@ -6461,7 +6453,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; -- cgit v1.2.3 From bfcd631eb6de474d8e097fd0f9f840fdf7272a1d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Nov 2018 13:23:12 +0000 Subject: tracing: Fix white space issues in parse_pred() function Trivial fix to clean up an indentation issue, a whole chunk of code has an extra space in the indentation. Link: http://lkml.kernel.org/r/20181109132312.20994-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 180ecb390baa..d3e59312ef40 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1222,30 +1222,30 @@ static int parse_pred(const char *str, void *data, * (perf doesn't use it) and grab everything. */ if (strcmp(field->name, "ip") != 0) { - parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); - goto err_free; - } - pred->fn = filter_pred_none; - - /* - * Quotes are not required, but if they exist then we need - * to read them till we hit a matching one. - */ - if (str[i] == '\'' || str[i] == '"') - q = str[i]; - else - q = 0; - - for (i++; str[i]; i++) { - if (q && str[i] == q) - break; - if (!q && (str[i] == ')' || str[i] == '&' || - str[i] == '|')) - break; - } - /* Skip quotes */ - if (q) - s++; + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. 
+ */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); -- cgit v1.2.3 From 6fc2171c5c03672bae71d04a0f5fa88cc9c3b4e2 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 30 Nov 2018 15:56:22 +0100 Subject: tracing: Allow RCU to run between postponed startup tests When building an allmodconfig kernel for arm64 and booting it in qemu, CONFIG_FTRACE_STARTUP_TEST gets enabled and that takes time so the watchdog expires and prints out a message like this: 'watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]' Depending on which test gets called from init_trace_selftests(), it can stay in the loop for minutes. Rework so that cond_resched() gets called in the init_trace_selftests() loop. Link: http://lkml.kernel.org/r/20181130145622.26334-1-anders.roxell@linaro.org Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Anders Roxell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3259019cc66d..4269af5905e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1722,6 +1722,10 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. + */ + cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { -- cgit v1.2.3 From cbe08bcbbe787315c425dde284dcb715cfbf3f39 Mon Sep 17 00:00:00 2001 From: Elazar Leibovich Date: Mon, 31 Dec 2018 13:58:37 +0200 Subject: tracing: Fix partial reading of trace event's id file When reading only part of the id file, the ppos isn't tracked correctly. This is taken care of by simple_read_from_buffer(). Reading a single byte, and then the next byte, would result in EOF. While this may not seem like a big deal, it breaks abstractions that read information from files unbuffered. See for example https://github.com/golang/go/issues/29399 This code was mentioned as problematic in commit cd458ba9d5a5 ("tracing: Do not (ab)use trace_seq in event_id_read()") An example C program that shows this bug is: #include <stdio.h> #include <stdlib.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> int main(int argc, char **argv) { if (argc < 2) return 1; int fd = open(argv[1], O_RDONLY); char c; read(fd, &c, 1); printf("First %c\n", c); read(fd, &c, 1); printf("Second %c\n", c); } Then run with, e.g. sudo ./a.out /sys/kernel/debug/tracing/events/tcp/tcp_set_state/id You'll notice you're getting the first character twice, instead of the first two characters in the id file.
Link: http://lkml.kernel.org/r/20181231115837.4932-1-elazar@lightbitslabs.com Cc: Orit Wasserman Cc: Oleg Nesterov Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 23725aeeab10b ("ftrace: provide an id file for each event") Signed-off-by: Elazar Leibovich Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 81c038ed6cee..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1319,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len; - if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV; -- cgit v1.2.3 From 8623b00676f16ed8972008095deca2c8e2b97a37 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 14 Jan 2019 22:34:08 -0600 Subject: tracing: Replace kzalloc with kcalloc Replace kzalloc() function with its 2-factor argument form, kcalloc(). This patch replaces cases of: kzalloc(a * b, gfp) with: kcalloc(a, b, gfp) This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190115043408.GA23456@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0d1d5353464..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -558,7 +558,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, parg->count); } - code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; @@ -637,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op = FETCH_OP_END; /* Shrink down the code buffer */ - parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) ret = -ENOMEM; else -- cgit v1.2.3 From b9416997603ef7e17d4de10b6408f19da2feb72c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Mon, 28 Jan 2019 17:55:53 -0800 Subject: tracing: Fix documentation about disabling options using trace_options To disable a tracing option using the trace_options file, the option name needs to be prefixed with 'no', and not suffixed, as the README states. Fix it. Link: http://lkml.kernel.org/r/154872690031.47356.5739053380942044586.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. 
Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4269af5905e4..a3a6945a7732 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4755,7 +4755,7 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From fdc6bae940ee9eb869e493990540098b8c0fd6ab Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Wed, 17 Apr 2019 10:48:33 +0200 Subject: ntp: Allow TAI-UTC offset to be set to zero The ADJ_TAI adjtimex mode sets the TAI-UTC offset of the system clock. It is typically set by NTP/PTP implementations and it is automatically updated by the kernel on leap seconds. The initial value is zero (which applications may interpret as unknown), but this value cannot be set by adjtimex. This limitation seems to go back to the original "nanokernel" implementation by David Mills. Change the ADJ_TAI check to accept zero as a valid TAI-UTC offset in order to allow setting it back to the initial value. Fixes: 153b5d054ac2 ("ntp: support for TAI") Suggested-by: Ondrej Mosnacek Signed-off-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Richard Cochran Cc: Prarit Bhargava Link: https://lkml.kernel.org/r/20190417084833.7401-1-mlichvar@redhat.com --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..f43d47c8c3b6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -690,7 +690,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant > 0) + if (txc->modes & ADJ_TAI && txc->constant >= 0) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) -- cgit v1.2.3 From f2b31bb598248c04721cb8485e6091a9feb045ac Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 8 May 2019 13:34:20 -0700 Subject: cgroup: never call do_group_exit() with task->frozen bit set I've got two independent reports that cgroup_task_frozen() check in cgroup_exit() has been triggered by lkp libhugetlbfs-test and LTP ptrace01 tests. 
For example: [ 44.576072] WARNING: CPU: 1 PID: 3028 at kernel/cgroup/cgroup.c:5932 cgroup_exit+0x148/0x160 [ 44.577724] Modules linked in: crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel sr_mod cdrom bochs_drm sg ttm ata_generic pata_acpi ppdev drm_kms_helper snd_pcm syscopyarea aesni_intel snd_timer sysfillrect sysimgblt snd crypto_simd cryptd glue_helper soundcore fb_sys_fops joydev drm serio_raw pcspkr ata_piix libata i2c_piix4 floppy parport_pc parport ip_tables [ 44.583106] CPU: 1 PID: 3028 Comm: ptrace-write-hu Not tainted 5.1.0-rc3-00053-g9262503 #5 [ 44.584600] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 44.586116] RIP: 0010:cgroup_exit+0x148/0x160 [ 44.587135] Code: 0f 84 50 ff ff ff 48 8b 85 c8 0c 00 00 48 8b 78 70 e8 ec 2e 00 00 e9 3b ff ff ff f0 ff 43 60 0f 88 72 21 89 00 e9 48 ff ff ff <0f> 0b e9 1b ff ff ff e8 3c 73 f4 ff 66 90 66 2e 0f 1f 84 00 00 00 [ 44.590113] RSP: 0018:ffffb25702dcfd30 EFLAGS: 00010002 [ 44.591167] RAX: ffff96a7fee32410 RBX: ffff96a7ff1d6000 RCX: dead000000000200 [ 44.592446] RDX: ffff96a7ff1d6080 RSI: ffff96a7fec75290 RDI: ffff96a7fec75290 [ 44.593715] RBP: ffff96a7fec745c0 R08: ffff96a7fec74658 R09: 0000000000000000 [ 44.594985] R10: 0000000000000000 R11: 0000000000000001 R12: ffff96a7fec75101 [ 44.596266] R13: ffff96a7fec745c0 R14: ffff96a7ff3bde30 R15: ffff96a7fec75130 [ 44.597550] FS: 0000000000000000(0000) GS:ffff96a7dd700000(0000) knlGS:0000000000000000 [ 44.598950] CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 [ 44.600098] CR2: 00000000f7a00000 CR3: 000000000d20e000 CR4: 00000000000406e0 [ 44.601417] Call Trace: [ 44.602777] do_exit+0x337/0xc40 [ 44.603677] do_group_exit+0x3a/0xa0 [ 44.604610] get_signal+0x12e/0x8d0 [ 44.605533] ? __switch_to_asm+0x40/0x70 [ 44.606503] do_signal+0x36/0x650 [ 44.607409] ? __switch_to_asm+0x40/0x70 [ 44.608383] ? __schedule+0x267/0x860 [ 44.609329] exit_to_usermode_loop+0x89/0xf0 [ 44.610349] do_fast_syscall_32+0x251/0x2e3 [ 44.611357] entry_SYSENTER_compat+0x7f/0x91 [ 44.612376] ---[ end trace e4ca5cfc4b7f7964 ]--- The problem is caused by the ptrace_signal() call in the for loop in get_signal(). There is a cgroup_enter_frozen() call inside ptrace_signal(), so after exit from ptrace_signal() the task->frozen bit might be set. In this case do_group_exit() can be called with the task->frozen bit set and trigger the warning. This is only place where we can leave the loop with the task->frozen bit set and without setting JOBCTL_TRAP_FREEZE and TIF_SIGPENDING. To resolve this problem, let's move cgroup_leave_frozen(true) call to just after the fatal label. If the task is going to die, the frozen bit must be cleared no matter how we get into this point. 
Reported-by: kernel test robot Reported-by: Qian Cai Cc: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/signal.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 16b72f4f14df..8607b11ff936 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2483,10 +2483,6 @@ relock: ksig->info.si_signo = signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); recalc_sigpending(); - current->jobctl &= ~JOBCTL_TRAP_FREEZE; - spin_unlock_irq(&sighand->siglock); - if (unlikely(cgroup_task_frozen(current))) - cgroup_leave_frozen(true); goto fatal; } @@ -2608,8 +2604,10 @@ relock: continue; } - spin_unlock_irq(&sighand->siglock); fatal: + spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. -- cgit v1.2.3 From c3b7112df86b769927a60a6d7175988ca3d60f09 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 10 May 2019 11:53:46 +0200 Subject: fork: do not release lock that wasn't taken Avoid calling cgroup_threadgroup_change_end() without having called cgroup_threadgroup_change_begin() first. During process creation we need to check whether the cgroup we are in allows us to fork. To perform this check the cgroup needs to guard itself against threadgroup changes and takes a lock. Prior to CLONE_PIDFD the cleanup target "bad_fork_free_pid" would also need to call cgroup_threadgroup_change_end() because said lock had already been taken. However, this is not the case anymore with the addition of CLONE_PIDFD. We are now allocating a pidfd before we check whether the cgroup we're in can fork and thus prior to taking the lock. So when copy_process() fails at the right step it would release a lock we haven't taken. This bug is not even very subtle to be honest. It's just not very clear from the naming of cgroup_threadgroup_change_{begin,end}() that a lock is taken. Here's the relevant splat: entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fec849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffed5a8c EFLAGS: 00000246 ORIG_RAX: 0000000000000078 RAX: ffffffffffffffda RBX: 0000000000003ffc RCX: 0000000000000000 RDX: 00000000200005c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(depth <= 0) WARNING: CPU: 1 PID: 7744 at kernel/locking/lockdep.c:4052 __lock_release kernel/locking/lockdep.c:4052 [inline] WARNING: CPU: 1 PID: 7744 at kernel/locking/lockdep.c:4052 lock_release+0x667/0xa00 kernel/locking/lockdep.c:4321 Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 7744 Comm: syz-executor007 Not tainted 5.1.0+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x45 kernel/panic.c:566 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:179 [inline] fixup_bug arch/x86/kernel/traps.c:174 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:972 RIP: 0010:__lock_release kernel/locking/lockdep.c:4052 [inline] RIP: 0010:lock_release+0x667/0xa00 kernel/locking/lockdep.c:4321 Code: 0f 85 a0 03 00 00 8b 35 77 66 08 08 85 f6 75 23 48 c7 c6 a0 55 6b 87 48 c7 c7 40 25 6b 87 4c 89 85 70 ff ff ff e8 b7 a9 eb ff <0f> 0b 4c 8b 85 70 ff ff ff 4c 89 ea 4c 89 e6 4c 89 c7 e8 52 63 ff RSP: 0018:ffff888094117b48 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 1ffff11012822f6f RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff815af236 RDI: ffffed1012822f5b RBP: ffff888094117c00 R08: ffff888092bfc400 R09: fffffbfff113301d R10: fffffbfff113301c R11: ffffffff889980e3 R12: ffffffff8a451df8 R13: ffffffff8142e71f R14: ffffffff8a44cc80 R15: ffff888094117bd8 percpu_up_read.constprop.0+0xcb/0x110 include/linux/percpu-rwsem.h:92 cgroup_threadgroup_change_end include/linux/cgroup-defs.h:712 [inline] copy_process.part.0+0x47ff/0x6710 kernel/fork.c:2222 copy_process kernel/fork.c:1772 [inline] _do_fork+0x25d/0xfd0 kernel/fork.c:2338 __do_compat_sys_x86_clone arch/x86/ia32/sys_ia32.c:240 [inline] __se_compat_sys_x86_clone arch/x86/ia32/sys_ia32.c:236 [inline] __ia32_compat_sys_x86_clone+0xbc/0x140 arch/x86/ia32/sys_ia32.c:236 do_syscall_32_irqs_on arch/x86/entry/common.c:334 [inline] do_fast_syscall_32+0x281/0xd54 arch/x86/entry/common.c:405 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fec849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffed5a8c EFLAGS: 00000246 ORIG_RAX: 0000000000000078 RAX: ffffffffffffffda RBX: 0000000000003ffc RCX: 0000000000000000 RDX: 00000000200005c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Kernel Offset: disabled Rebooting in 86400 seconds.. 
Reported-and-tested-by: syzbot+3286e58549edc479faae@syzkaller.appspotmail.com Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Christian Brauner --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5359facf9867..737db1828437 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2102,7 +2102,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_put_pidfd; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -2217,11 +2217,12 @@ bad_fork_cancel_cgroup: spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_cgroup_threadgroup_change_end: + cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) ksys_close(pidfd); bad_fork_free_pid: - cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: -- cgit v1.2.3 From 56e33afd7757d5da2664fb797f2544ce167176be Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 10 May 2019 23:47:50 +0200 Subject: livepatch: Remove klp_check_compiler_support() The only purpose of klp_check_compiler_support() is to make sure that we are not using ftrace on x86 via mcount (because that's executed only after prologue has already happened, and that's too late for livepatching purposes). Now that mcount is not supported by ftrace any more, there is no need for klp_check_compiler_support() either. Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1905102346100.17054@cbobk.fhfr.pm Reported-by: Linus Torvalds Signed-off-by: Jiri Kosina Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..112a36ed4a09 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1220,14 +1220,6 @@ void klp_module_going(struct module *mod) static int __init klp_init(void) { - int ret; - - ret = klp_check_compiler_support(); - if (ret) { - pr_info("Your compiler is too old; turning off.\n"); - return -EINVAL; - } - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); if (!klp_root_kobj) return -ENOMEM; -- cgit v1.2.3 From af959b18fd447170a10865283ba691af4353cc7f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 11 May 2019 03:03:09 +0200 Subject: bpf: fix out of bounds backwards jmps due to dead code removal systemtap folks reported the following splat recently: [ 7790.862212] WARNING: CPU: 3 PID: 26759 at arch/x86/kernel/kprobes/core.c:1022 kprobe_fault_handler+0xec/0xf0 [...] [ 7790.864113] CPU: 3 PID: 26759 Comm: sshd Not tainted 5.1.0-0.rc7.git1.1.fc31.x86_64 #1 [ 7790.864198] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS[...] [ 7790.864314] RIP: 0010:kprobe_fault_handler+0xec/0xf0 [ 7790.864375] Code: 48 8b 50 [...]
[ 7790.864714] RSP: 0018:ffffc06800bdbb48 EFLAGS: 00010082 [ 7790.864812] RAX: ffff9e2b75a16320 RBX: 0000000000000000 RCX: 0000000000000000 [ 7790.865306] RDX: ffffffffffffffff RSI: 000000000000000e RDI: ffffc06800bdbbf8 [ 7790.865514] RBP: ffffc06800bdbbf8 R08: 0000000000000000 R09: 0000000000000000 [ 7790.865960] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc06800bdbbf8 [ 7790.866037] R13: ffff9e2ab56a0418 R14: ffff9e2b6d0bb400 R15: ffff9e2b6d268000 [ 7790.866114] FS: 00007fde49937d80(0000) GS:ffff9e2b75a00000(0000) knlGS:0000000000000000 [ 7790.866193] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7790.866318] CR2: 0000000000000000 CR3: 000000012f312000 CR4: 00000000000006e0 [ 7790.866419] Call Trace: [ 7790.866677] do_user_addr_fault+0x64/0x480 [ 7790.867513] do_page_fault+0x33/0x210 [ 7790.868002] async_page_fault+0x1e/0x30 [ 7790.868071] RIP: 0010: (null) [ 7790.868144] Code: Bad RIP value. [ 7790.868229] RSP: 0018:ffffc06800bdbca8 EFLAGS: 00010282 [ 7790.868362] RAX: ffff9e2b598b60f8 RBX: ffffc06800bdbe48 RCX: 0000000000000004 [ 7790.868629] RDX: 0000000000000004 RSI: ffffc06800bdbc6c RDI: ffff9e2b598b60f0 [ 7790.868834] RBP: ffffc06800bdbcf8 R08: 0000000000000000 R09: 0000000000000004 [ 7790.870432] R10: 00000000ff6f7a03 R11: 0000000000000000 R12: 0000000000000001 [ 7790.871859] R13: ffffc06800bdbcb8 R14: 0000000000000000 R15: ffff9e2acd0a5310 [ 7790.873455] ? vfs_read+0x5/0x170 [ 7790.874639] ? vfs_read+0x1/0x170 [ 7790.875834] ? trace_call_bpf+0xf6/0x260 [ 7790.877044] ? vfs_read+0x1/0x170 [ 7790.878208] ? vfs_read+0x5/0x170 [ 7790.879345] ? kprobe_perf_func+0x233/0x260 [ 7790.880503] ? vfs_read+0x1/0x170 [ 7790.881632] ? vfs_read+0x5/0x170 [ 7790.882751] ? kprobe_ftrace_handler+0x92/0xf0 [ 7790.883926] ? __vfs_read+0x30/0x30 [ 7790.885050] ? ftrace_ops_assist_func+0x94/0x100 [ 7790.886183] ? vfs_read+0x1/0x170 [ 7790.887283] ? vfs_read+0x5/0x170 [ 7790.888348] ? ksys_read+0x5a/0xe0 [ 7790.889389] ? do_syscall_64+0x5c/0xa0 [ 7790.890401] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe After some debugging, turns out that the logic in 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") has a bug that is exposed after 52875a04f4b2 ("bpf: verifier: remove dead code") in that we miss some of the jump offset adjustments after code patching when we remove dead code, more concretely, upon backward jump spanning over the area that is being removed. BPF insns of a case that was hit pre 52875a04f4b2: [...] 676: (85) call bpf_perf_event_output#-47616 677: (05) goto pc-636 678: (62) *(u32 *)(r10 -64) = 0 679: (bf) r7 = r10 680: (07) r7 += -64 681: (05) goto pc-44 682: (05) goto pc-1 683: (05) goto pc-1 BPF insns afterwards: [...] 618: (85) call bpf_perf_event_output#-47616 619: (05) goto pc-638 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 To illustrate the bug, situation looks as follows: ____ 0 | | <-- foo: [...] 1 |____| 2 |____| <-- pos / end_new ^ 3 | | | 4 | | | len 5 |____| | (remove region) 6 | | <-- end_old v 7 | | 8 | | <-- curr (jmp foo) 9 |____| The condition curr >= end_new && curr + off + 1 < end_new in the branch delta adjustments is never hit because curr + off + 1 < end_new is compared as unsigned and therefore curr + off + 1 > end_new in unsigned realm as curr + off + 1 becomes negative since the insns are memmove()'d before the offset adjustments. Correct BPF insns after this fix: [...] 
618: (85) call bpf_perf_event_output#-47216 619: (05) goto pc-578 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 Note that unprivileged case is not affected from this. Fixes: 52875a04f4b2 ("bpf: verifier: remove dead code") Fixes: 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") Reported-by: Frank Ch. Eigler Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3ba56e73c90e..242a643af82f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -338,7 +338,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; @@ -356,7 +356,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; -- cgit v1.2.3 From ecebc5ce59a003163eb608ace38a01d7ffeb0a95 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 22 Mar 2019 18:52:27 -0700 Subject: kdb: Get rid of broken attempt to print CCVERSION in kdb summary If you drop into kdb and type "summary", it prints out a line that says this: ccversion CCVERSION ...and I don't mean that it actually prints out the version of the C compiler. It literally prints out the string "CCVERSION". The version of the C Compiler is already printed at boot up and it doesn't seem useful to replicate this in kdb. Let's just delete it. We can also delete the bit of the Makefile that called the C compiler in an attempt to pass this into kdb. This will remove one extra call to the C compiler at Makefile parse time and (very slightly) speed up builds. Signed-off-by: Douglas Anderson Reviewed-by: Masahiro Yamada Signed-off-by: Daniel Thompson --- kernel/debug/kdb/Makefile | 1 - kernel/debug/kdb/kdb_main.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -6,7 +6,6 @@ # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
# -CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..fc96dbf8d9de 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("machine %s\n", init_uts_ns.name.machine); kdb_printf("nodename %s\n", init_uts_ns.name.nodename); kdb_printf("domainname %s\n", init_uts_ns.name.domainname); - kdb_printf("ccversion %s\n", __stringify(CCVERSION)); now = __ktime_get_real_seconds(); time64_to_tm(now, 0, &tm); -- cgit v1.2.3 From b586627e10f57ee3aa8f0cfab0d6f7dc4ae63760 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 May 2019 15:50:18 +0300 Subject: kdb: do a sanity check on the cpu in kdb_per_cpu() The "whichcpu" comes from argv[3]. The cpu_online() macro looks up the cpu in a bitmap of online cpus, but if the value is too high then it could read beyond the end of the bitmap and possibly Oops. Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") Signed-off-by: Dan Carpenter Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index fc96dbf8d9de..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2583,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv) diag = kdbgetularg(argv[3], &whichcpu); if (diag) return diag; - if (!cpu_online(whichcpu)) { + if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { kdb_printf("cpu %ld is not online\n", whichcpu); return KDB_BADCPUNUM; } -- cgit v1.2.3 From e2f7fc0ac6957cabff4cecf6c721979b571af208 Mon Sep 17 00:00:00 2001 From: Krzesimir Nowak Date: Wed, 8 May 2019 18:08:58 +0200 Subject: bpf: fix undefined behavior in narrow load handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") made the verifier add AND instructions to clear the unwanted bits with a mask when doing a narrow load. The mask is computed with (1 << size * 8) - 1 where "size" is the size of the narrow load. When doing a 4 byte load of an 8 byte field the verifier shifts the literal 1 by 32 places to the left. This results in an overflow of a signed integer, which is undefined behavior. Typically, the computed mask was zero, so the result of the narrow load ended up being zero too. Cast the literal to long long to avoid overflows. Note that a narrow load of a 4 byte field does not have the undefined behavior, because the load size can only be either 1 or 2 bytes, so shifting 1 by 8 or 16 places will not overflow it. And reading 4 bytes would not be a narrow load of a 4 byte field.
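The overflow is easy to reproduce in a standalone program (a sketch that paraphrases the mask computation; it is not the verifier code itself):

#include <stdio.h>

int main(void)
{
	int size = 4; /* 4-byte narrow load of an 8-byte context field */

	/* Undefined: 1 is a 32-bit int, so 1 << 32 overflows. In
	 * practice the mask often ends up as 0, zeroing the load. */
	unsigned long long bad = (1 << size * 8) - 1;

	/* Defined: the literal is widened to 64 bits first. */
	unsigned long long good = (1ULL << size * 8) - 1;

	printf("bad:  %#llx\ngood: %#llx\n", bad, good);
	return 0;
}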
Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Reviewed-by: Alban Crequy Reviewed-by: Iago López Galeiras Signed-off-by: Krzesimir Nowak Cc: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b05e8938d5c..95f9354495ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7599,7 +7599,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } } -- cgit v1.2.3 From 2baae3545327632167c0180e9ca1d467416f1919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 May 2019 09:59:16 -0700 Subject: bpf: devmap: fix use-after-free Read in __dev_map_entry_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit synchronize_rcu() is fine when the rcu callbacks only need to free memory (kfree_rcu() or direct kfree() call rcu call backs) __dev_map_entry_free() is a bit more complex, so we need to make sure that call queued __dev_map_entry_free() callbacks have completed. sysbot report: BUG: KASAN: use-after-free in dev_map_flush_old kernel/bpf/devmap.c:365 [inline] BUG: KASAN: use-after-free in __dev_map_entry_free+0x2a8/0x300 kernel/bpf/devmap.c:379 Read of size 8 at addr ffff8801b8da38c8 by task ksoftirqd/1/18 CPU: 1 PID: 18 Comm: ksoftirqd/1 Not tainted 4.17.0+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 dev_map_flush_old kernel/bpf/devmap.c:365 [inline] __dev_map_entry_free+0x2a8/0x300 kernel/bpf/devmap.c:379 __rcu_reclaim kernel/rcu/rcu.h:178 [inline] rcu_do_batch kernel/rcu/tree.c:2558 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2818 [inline] __rcu_process_callbacks kernel/rcu/tree.c:2785 [inline] rcu_process_callbacks+0xe9d/0x1760 kernel/rcu/tree.c:2802 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:284 run_ksoftirqd+0x86/0x100 kernel/softirq.c:645 smpboot_thread_fn+0x417/0x870 kernel/smpboot.c:164 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 Allocated by task 6675: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:513 [inline] kzalloc include/linux/slab.h:706 [inline] dev_map_alloc+0x208/0x7f0 kernel/bpf/devmap.c:102 find_and_alloc_map kernel/bpf/syscall.c:129 [inline] map_create+0x393/0x1010 kernel/bpf/syscall.c:453 __do_sys_bpf kernel/bpf/syscall.c:2351 [inline] __se_sys_bpf kernel/bpf/syscall.c:2328 [inline] __x64_sys_bpf+0x303/0x510 kernel/bpf/syscall.c:2328 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 26: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 dev_map_free+0x4fa/0x670 
kernel/bpf/devmap.c:191 bpf_map_free_deferred+0xba/0xf0 kernel/bpf/syscall.c:262 process_one_work+0xc64/0x1b70 kernel/workqueue.c:2153 worker_thread+0x181/0x13a0 kernel/workqueue.c:2296 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 The buggy address belongs to the object at ffff8801b8da37c0 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 264 bytes inside of 512-byte region [ffff8801b8da37c0, ffff8801b8da39c0) The buggy address belongs to the page: page:ffffea0006e368c0 count:1 mapcount:0 mapping:ffff8801da800940 index:0xffff8801b8da3540 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0007217b88 ffffea0006e30cc8 ffff8801da800940 raw: ffff8801b8da3540 ffff8801b8da3040 0000000100000004 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801b8da3780: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8801b8da3800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > ffff8801b8da3880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801b8da3900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b8da3980: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Eric Dumazet Reported-by: syzbot+457d3e2ffbcf31aee5c0@syzkaller.appspotmail.com Acked-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 191b79948424..1e525d70f833 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -164,6 +164,9 @@ static void dev_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); + /* Make sure prior __dev_map_entry_free() have completed. */ + rcu_barrier(); + /* To ensure all pending flush operations have completed wait for flush * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected -- cgit v1.2.3 From 390e99cfdda1334f45c718cc02cd26eb3135f233 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 13 May 2019 12:04:36 -0700 Subject: bpf: mark bpf_event_notify and bpf_event_init as static Both of them are not declared in the headers and not used outside of bpf_trace.c file. 
Fixes: a38d1107f937c ("bpf: support raw tracepoints in modules") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b496ffdf5f36..f92d6ad5e080 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1297,7 +1297,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, } #ifdef CONFIG_MODULES -int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +static int bpf_event_notify(struct notifier_block *nb, unsigned long op, + void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; @@ -1336,7 +1337,7 @@ static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; -int __init bpf_event_init(void) +static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; -- cgit v1.2.3 From ca976bfb3154c7bc67c4651ecd144fdf67ccaee7 Mon Sep 17 00:00:00 2001 From: Wenlin Kang Date: Mon, 13 May 2019 16:57:20 +0800 Subject: kdb: Fix bound check compiler warning The strncpy() function may leave the destination string buffer unterminated; better to use strscpy() instead. This fixes the following warning with gcc 8.2: kernel/debug/kdb/kdb_io.c: In function 'kdb_getstr': kernel/debug/kdb/kdb_io.c:449:3: warning: 'strncpy' specified bound 256 equals destination size [-Wstringop-truncation] strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Wenlin Kang Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -446,7 +446,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); kdb_printf(kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); -- cgit v1.2.3 From a9e73998f9d705c94a8dca9687633adc0f24a19a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 13 May 2019 17:15:40 -0700 Subject: kernel/sys.c: prctl: fix false positive in validate_prctl_map() While validating a new map we require the @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set from executable loaders such as the ELF handlers; still, it is perfectly valid to have a loadable data section with zero size in the file, in such a case start_data is equal to end_data once the kernel loader finishes. As a result, when we're trying to restore such programs, the procedure fails and the kernel returns -EINVAL. From the image dump of a program: | "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0", Thus we need to change validate_prctl_map() to use the less-or-equal operator instead of strictly-less.
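With the values from the image dump above, the effect of relaxing the comparison can be checked in a few lines (a standalone model of the check, not the kernel macro):

#include <stdio.h>

/* Standalone model of the ordering check: returns 0 on success,
 * -22 (-EINVAL) on failure, mirroring the kernel's error code. */
static int check_order(unsigned long a, unsigned long b, int strict)
{
	return (strict ? a < b : a <= b) ? 0 : -22;
}

int main(void)
{
	/* values from the image dump in the commit message */
	unsigned long start_data = 0xf1bfb0, end_data = 0xf1bfb0;

	printf("strict '<':   %d\n", check_order(start_data, end_data, 1));
	printf("relaxed '<=': %d\n", check_order(start_data, end_data, 0));
	return 0;
}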
Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov Cc: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); -- cgit v1.2.3 From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 13 May 2019 17:16:41 -0700 Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd Userfaultfd can be misused to make it easier to exploit existing use-after-free (and similar) bugs that might otherwise only make a short window or race condition available. By using userfaultfd to stall a kernel thread, a malicious program can keep some state that it wrote, stable for an extended period, which it can then access using an existing exploit. While it doesn't cause the exploit itself, and while it's not the only thing that can stall a kernel thread when accessing a memory location, it's one of the few that never needs privilege. We can add a flag, allowing userfaultfd to be restricted, so that in general it won't be usable by arbitrary user programs, but in environments that require userfaultfd it can be turned back on. Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users. When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Andrea said: : The only difference between the bpf sysctl and the userfaultfd sysctl : this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability : requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement, : because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE : already if it's doing other kind of tracking on processes runtime, in : addition of userfaultfd. In other words both syscalls works only for : root, when the two sysctl are opt-in set to 1. [dgilbert@redhat.com: changelog additions] [akpm@linux-foundation.org: documentation tweak, per Mike] Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Paolo Bonzini Cc: Hugh Dickins Cc: Luis Chamberlain Cc: Maxime Coquelin Cc: Maya Gokhale Cc: Jerome Glisse Cc: Pavel Emelyanov Cc: Johannes Weiner Cc: Martin Cracauer Cc: Denis Plotnikov Cc: Marty McFadden Cc: Mike Kravetz Cc: Kees Cook Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: "Dr .
David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. -- cgit v1.2.3 From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. 
This patchset does the initial mechanical conversion of all the places that call mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event, as well as the vma if it is known (most invalidations happen against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. MMU_NOTIFY_UNMAP is always the safe default, as users of mmu notifier should assume that everything in the range is going away when that event happens. A later patch converts the mm call paths to use more appropriate events for each call. This is done as 2 patches so that no call site is forgotten, especially as it uses the following coccinelle patch:

%<----------------------------------------------------------------------
@@
identifier I1, I2, I3, I4;
@@
static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1,
+enum mmu_notifier_event event,
+unsigned flags,
+struct vm_area_struct *vma,
struct mm_struct *I2, unsigned long I3, unsigned long I4)
{ ... }

@@
@@
-#define mmu_notifier_range_init(range, mm, start, end)
+#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end)

@@
expression E1, E3, E4;
identifier I1;
@@
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, I1,
I1->vm_mm, E3, E4)
...>

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(..., struct vm_area_struct *VMA, ...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...>
}

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(...) {
struct vm_area_struct *VMA;
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...>
}

@@
expression E1, E2, E3, E4;
identifier FN;
@@
FN(...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, NULL,
E2, E3, E4)
...>
}
---------------------------------------------------------------------->%

Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit v1.2.3 From 7269f999934b289da7972e975b781417b07ef836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Mon, 13 May 2019 17:20:53 -0700 Subject: mm/mmu_notifier: use correct mmu_notifier events for each invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates each existing
invalidation to use the correct mmu notifier event that represents what is happening to the CPU page table. See the patch which introduced the events for the rationale behind this. Link: http://lkml.kernel.org/r/20190326164747.24405-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e34b699f3865..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit v1.2.3 From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where we allocate the memmap from, by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which select additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and an altmap for the alternative memmap allocator. This patch shouldn't introduce any functional change.
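As a sketch of the resulting calling convention (hypothetical caller; MHP_MEMBLOCK_API is the only flag defined at this point), a hotplug path that wants the sysfs memblock API and the default memmap allocation would do roughly:

  struct mhp_restrictions restrictions = {
          .flags  = MHP_MEMBLOCK_API,     /* create sysfs memblock devices */
          .altmap = NULL,                 /* no alternative memmap allocator */
  };

  rc = arch_add_memory(nid, start, size, &restrictions);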
[akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { -- cgit v1.2.3 From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only a few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization, and avoids the need to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov
Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); -- cgit v1.2.3 From 640be2d1ffbc1946f1547eb89b5005ed7542de99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:23:23 -0700 Subject: kernel/memremap.c: remove the unused device_private_entry_fault() export This export has been entirely unused since it was added more than 1 1/2 years ago. Link: http://lkml.kernel.org/r/20190429115535.12793-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4e59d29245f4..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) -- cgit v1.2.3 From c6110222c6f49ea68169f353565eb865488a8619 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 May 2019 01:18:55 +0200 Subject: bpf: add map_lookup_elem_sys_only for lookups from syscall side Add a callback map_lookup_elem_sys_only() that map implementations could use over map_lookup_elem() from system call side in case the map implementation needs to handle the latter differently than from the BPF data path. If map_lookup_elem_sys_only() is set, this will be preferred pick for map lookups out of user space. This hook is used in a follow-up fix for LRU map, but once development window opens, we can convert other map types from map_lookup_elem() (here, the one called upon BPF_MAP_LOOKUP_ELEM cmd is meant) over to use the callback to simplify and clean up the latter. 
Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ad3ccf82f31d..cb5440b02e82 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -808,7 +808,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { -- cgit v1.2.3 From 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 May 2019 01:18:56 +0200 Subject: bpf, lru: avoid messing with eviction heuristics upon syscall lookup One of the biggest issues we face right now with picking the LRU map over a regular hash table is that a map walk out of user space, for example, to just dump the existing entries or to remove certain ones, will completely mess up LRU eviction heuristics and wrong entries such as just created ones will get evicted instead. The reason for this is that we mark an entry as "in use" via bpf_lru_node_set_ref() from the system call lookup side as well. Thus upon walk, all entries are being marked, so information about the actual least recently used ones is "lost". In the case of Cilium, where it can be used (among other things) as a BPF based connection tracker, this current behavior causes disruption upon control plane changes that need to walk the map from user space to evict certain entries. The discussion result from bpfconf [0] was that we should simply just remove marking from the system call side as no good use case could be found where it's actually needed there. Therefore this patch removes marking for the regular LRU and per-CPU flavor. If there should ever be a need in the future, the behavior could be selected via a map creation flag, but due to the mentioned reason we avoid this here.
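For context, the kind of user-space map walk that used to disturb the heuristics looks roughly like this (libbpf calls; error handling trimmed). After this patch, the syscall-side lookups below no longer mark entries as recently used:

  #include <stdio.h>
  #include <bpf/bpf.h>

  static void dump_lru_map(int map_fd)
  {
          __u32 key, next_key, value;
          void *prev = NULL;

          while (bpf_map_get_next_key(map_fd, prev, &next_key) == 0) {
                  /* previously, each lookup here set the LRU reference bit */
                  if (bpf_map_lookup_elem(map_fd, &next_key, &value) == 0)
                          printf("key %u -> value %u\n", next_key, value);
                  key = next_key;
                  prev = &key;
          }
  }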
[0] http://vger.kernel.org/bpfconf.html Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH") Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 192d32e77db3..0f2708fde5f7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -527,18 +527,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, + void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { - bpf_lru_node_set_ref(&l->lru_node); + if (mark) + bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, true); +} + +static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, false); +} + static u32 htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -1250,6 +1262,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, @@ -1281,7 +1294,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -1297,8 +1309,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; - if (htab_is_lru(htab)) - bpf_lru_node_set_ref(&l->lru_node); + /* We do not mark LRU map element here in order to not mess up + * eviction heuristics when user space does a map walk. + */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, -- cgit v1.2.3 From acb2ec3dd003b50b6fb5772057a08ec0dc45d42a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 May 2019 15:40:43 -0700 Subject: kernel/Makefile: don't assume that kernel/gen_ikh_data.sh is executable If the user downloads and applies patch-5.1.gz using patch(1), the x bit on kernel/gen_ikh_data.sh is not set. /bin/sh: 1: ./kernel/gen_ikh_data.sh: Permission denied Fix this by using CONFIG_SHELL. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 298437bb2c6a..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) -- cgit v1.2.3 From c3f3ce049f7d97cc7ec9c01cb51d9ec74e0f37c2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:46 -0700 Subject: userfaultfd: use RCU to free the task struct when fork fails The task structure is freed while get_mem_cgroup_from_mm() holds rcu_read_lock() and dereferences mm->owner. get_mem_cgroup_from_mm() failing fork() ---- --- task = mm->owner mm->owner = NULL; free(task) if (task) *task; /* use after free */ The fix consists in freeing the task with RCU also in the fork failure case, exactly like it always happens for the regular exit(2) path. That is enough to make the rcu_read_lock hold in get_mem_cgroup_from_mm() (left side above) effective to avoid a use after free when dereferencing the task structure. An alternate possible fix would be to defer the delivery of the userfaultfd contexts to the monitor until after fork() is guaranteed to succeed. Such a change would require more changes because it would create a strict ordering dependency where the uffd methods would need to be called beyond the last potentially failing branch in order to be safe. This solution as opposed only adds the dependency to common code to set mm->owner to NULL and to free the task struct that was pointed by mm->owner with RCU, if fork ends up failing. The userfaultfd methods can still be called anywhere during the fork runtime and the monitor will keep discarding orphaned "mm" coming from failed forks in userland. This race condition couldn't trigger if CONFIG_MEMCG was set =n at build time. [aarcange@redhat.com: improve changelog, reduce #ifdefs per Michal] Link: http://lkml.kernel.org/r/20190429035752.4508-1-aarcange@redhat.com Link: http://lkml.kernel.org/r/20190325225636.11635-2-aarcange@redhat.com Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event") Signed-off-by: Andrea Arcangeli Tested-by: zhong jiang Reported-by: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Oleg Nesterov Cc: Jann Horn Cc: Hugh Dickins Cc: Mike Rapoport Cc: Mike Kravetz Cc: Peter Xu Cc: Jason Gunthorpe Cc: "Kirill A . 
Shutemov" Cc: Michal Hocko Cc: zhong jiang Cc: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..b409e792aadc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1343,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1726,6 +1736,21 @@ static int pidfd_create(struct pid *pid) return fd; } +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2233,8 +2258,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2265,7 +2292,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); -- cgit v1.2.3 From 987717e5e016a0dd3011d3bb16546672713f94e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:50 -0700 Subject: mm: change mm_update_next_owner() to update mm->owner with WRITE_ONCE The RCU reader uses rcu_dereference() inside rcu_read_lock critical sections, so the writer shall use WRITE_ONCE. Just a cleanup, we still rely on gcc to emit atomic writes in other places. Link: http://lkml.kernel.org/r/20190325225636.11635-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Jann Horn Cc: Jason Gunthorpe Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Oleg Nesterov Cc: Peter Xu Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. 
*/ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } -- cgit v1.2.3 From 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:56 -0700 Subject: psi: introduce state_mask to represent stalled psi states Patch series "psi: pressure stall monitors", v6. This is a respin of: https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/ Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high-enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in the userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before the device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x fewer false positives than vmpressure signals. Having the ability to specify multiple triggers for the same psi metric allows other parts of the Android framework to monitor the memory state of the device and act accordingly. The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.:

  /* Signal when stall time exceeds 100ms of a 1s window */
  char trigger[] = "full 100000 1000000";

  fd = open("/proc/pressure/memory", O_RDWR);
  write(fd, trigger, sizeof(trigger));
  while (poll() >= 0) {
          ...
  };
  close(fd);

When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-6 prepare the psi code for polling support. Patch 7 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files. The patches were developed in collaboration with Johannes Weiner. This patch (of 7): The psi monitoring patches will need to determine the same states as record_times(). To avoid calculating them twice, maintain a state mask that can be consulted cheaply. Do this in a separate patch to keep the churn in the main feature patch at a minimum.
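The pattern this patch establishes, condensed from the diff below: the writer recomputes the mask once per task state change, and samplers test bits instead of re-deriving each state from the task counts:

  /* writer side: recompute on task state change */
  for (s = 0; s < NR_PSI_STATES; s++) {
          if (test_state(groupc->tasks, s))
                  state_mask |= (1 << s);
  }
  groupc->state_mask = state_mask;

  /* reader side: a cheap bit test replaces test_state() */
  if (groupc->state_mask & (1 << PSI_MEM_SOME))
          times[PSI_MEM_SOME] += now - state_start;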
This adds 4-byte state_mask member into psi_group_cpu struct which results in its first cacheline-aligned part becoming 52 bytes long. Add explicit values to enumeration element counters that affect psi_group_cpu struct size. Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..22c1505ad290 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -213,17 +213,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) static void get_recent_times(struct psi_group *group, int cpu, u32 *times) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,7 +239,7 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; delta = times[s] - groupc->times_prev[s]; @@ -407,15 +407,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,10 +436,10 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } @@ -448,6 +448,8 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,6 +482,13 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); } -- cgit v1.2.3 From 9289c5e6a78a5a9397df5fa60eb82b105abcfecf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan 
Date: Tue, 14 May 2019 15:40:59 -0700 Subject: psi: make psi_enable static psi_enable is not used outside of psi.c, make it static. Link: http://lkml.kernel.org/r/20190319235619.260832-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 22c1505ad290..281702de9772 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,9 +140,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { -- cgit v1.2.3 From bcc78db64168eb6dede056fed2999f75f7ace309 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:02 -0700 Subject: psi: rename psi fields in preparation for psi trigger addition Rename psi_group structure member fields used for calculating psi totals and averages for clear distinction between them and for trigger-related fields that will be added by "psi: introduce psi monitor". [surenb@google.com: v6] Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 281702de9772..4fb4d9913bc8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -165,7 +165,7 @@ static struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +173,9 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); } void __init psi_init(void) @@ -278,7 +278,7 @@ static bool update_stats(struct psi_group *group) int cpu; int s; - mutex_lock(&group->stat_lock); + mutex_lock(&group->avgs_lock); /* * Collect the per-cpu time buckets and average them into a @@ -319,7 +319,7 @@ static bool update_stats(struct psi_group *group) /* avgX= */ now = sched_clock(); - expires = group->next_update; + expires = group->avg_next_update; if (now < expires) goto out; if (now - expires >= psi_period) @@ -332,14 +332,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. 
*/ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,22 +359,22 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } out: - mutex_unlock(&group->stat_lock); + mutex_unlock(&group->avgs_lock); return nonidle_total; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; bool nonidle; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); /* * If there is task activity, periodically fold the per-cpu @@ -391,8 +391,9 @@ static void psi_update_work(struct work_struct *work) u64 now; now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; + if (group->avg_next_update > now) + delay = nsecs_to_jiffies( + group->avg_next_update - now) + 1; schedule_delayed_work(dwork, delay); } } @@ -546,13 +547,13 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -649,7 +650,7 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); } -- cgit v1.2.3 From 7fc70a3999366560ad1d4f2389a78360300c2c6a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:06 -0700 Subject: psi: split update_stats into parts Split update_stats into collect_percpu_times and update_averages for collect_percpu_times to be reused later inside psi monitor. 
Link: http://lkml.kernel.org/r/20190319235619.260832-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 57 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4fb4d9913bc8..ace5ed97b186 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -269,17 +269,13 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static bool collect_percpu_times(struct psi_group *group) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; int cpu; int s; - mutex_lock(&group->avgs_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -317,11 +313,18 @@ static bool update_stats(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + return nonidle_total; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + /* avgX= */ - now = sched_clock(); expires = group->avg_next_update; - if (now < expires) - goto out; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,7 +335,7 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + avg_next_update = expires + ((1 + missed_periods) * psi_period); period = now - (group->avg_last_update + (missed_periods * psi_period)); group->avg_last_update = now; @@ -362,9 +365,8 @@ static bool update_stats(struct psi_group *group) group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->avgs_lock); - return nonidle_total; + + return avg_next_update; } static void psi_avgs_work(struct work_struct *work) @@ -372,10 +374,16 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; bool nonidle; + u64 now; dwork = to_delayed_work(work); group = container_of(dwork, struct psi_group, avgs_work); + mutex_lock(&group->avgs_lock); + + now = sched_clock(); + + nonidle = collect_percpu_times(group); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,19 +391,15 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. 
*/ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; - - now = sched_clock(); - if (group->avg_next_update > now) - delay = nsecs_to_jiffies( - group->avg_next_update - now) + 1; - schedule_delayed_work(dwork, delay); + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); } + + mutex_unlock(&group->avgs_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -707,11 +711,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; -- cgit v1.2.3 From 333f3017c5a893b000b2b4a3529814ce93fa83d7 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:09 -0700 Subject: psi: track changed states Introduce changed_states parameter into collect_percpu_times to track the states changed since the last update. This will be needed to detect whether polled states activated in the monitor patch. Link: http://lkml.kernel.org/r/20190319235619.260832-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ace5ed97b186..1b99eeffaa25 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -210,7 +210,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); u64 now, state_start; @@ -218,6 +219,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) unsigned int seq; u32 state_mask; + *pchanged_states = 0; + /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); @@ -246,6 +249,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) groupc->times_prev[s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,10 +274,11 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool collect_percpu_times(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; + u32 changed_states = 0; int cpu; int s; @@ -287,8 +293,11 @@ static bool collect_percpu_times(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, times, + &cpu_changed_states); + 
changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -313,7 +322,8 @@ static bool collect_percpu_times(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); - return nonidle_total; + if (pchanged_states) + *pchanged_states = changed_states; } static u64 update_averages(struct psi_group *group, u64 now) @@ -373,6 +383,7 @@ static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; u64 now; @@ -383,7 +394,8 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - nonidle = collect_percpu_times(group); + collect_percpu_times(group, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -719,7 +731,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group); + collect_percpu_times(group, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); -- cgit v1.2.3 From 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:12 -0700 Subject: include/: refactor headers to allow kthread.h inclusion in psi_types.h kthread.h can't be included in psi_types.h because it creates a circular inclusion with kthread.h eventually including psi_types.h and complaining on kthread structures not being defined because they are defined further in the kthread.h. Resolve this by removing psi_types.h inclusion from the headers included from kthread.h. Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 0e94682b73bfa6c44c98af7a26771c9c08c055d5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:15 -0700 Subject: psi: introduce psi monitor Psi monitor aims to provide a low-latency short-term pressure detection mechanism configurable by users. It allows users to monitor psi metrics growth and trigger events whenever a metric raises above user-defined threshold within user-defined time window. Time window and threshold are both expressed in usecs. Multiple psi resources with different thresholds and window sizes can be monitored concurrently. Psi monitors activate when system enters stall state for the monitored psi metric and deactivate upon exit from the stall state. While system is in the stall state psi signal growth is monitored at a rate of 10 times per tracking window. Min window size is 500ms, therefore the min monitoring interval is 50ms. Max window size is 10s with monitoring interval of 1s. 
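For reference, a minimal self-contained user-space consumer of the interface described above (a sketch modeled on the documentation example from this series; error handling trimmed). The event arrives as POLLPRI, and POLLERR indicates that the trigger went away:

  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* signal when memory is fully stalled for 150ms of a 1s window */
          const char trig[] = "full 150000 1000000";
          struct pollfd fds;

          fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
          if (fds.fd < 0 || write(fds.fd, trig, strlen(trig) + 1) < 0) {
                  perror("trigger setup");
                  return 1;
          }
          fds.events = POLLPRI;

          while (1) {
                  if (poll(&fds, 1, -1) < 0) {
                          perror("poll");
                          return 1;
                  }
                  if (fds.revents & POLLERR) {
                          fprintf(stderr, "event source is gone\n");
                          return 1;
                  }
                  if (fds.revents & POLLPRI)
                          printf("memory pressure event triggered\n");
          }
          return 0;
  }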
When activated psi monitor stays active for at least the duration of one tracking window to avoid repeated activations/deactivations when psi signal is bouncing. Notifications to the users are rate-limited to one per tracking window. Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 71 ++++++- kernel/sched/psi.c | 494 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 547 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 327f37c9fdfa..1140357d46f4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { @@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = { .name = "io.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1b99eeffaa25..e88918e0bb6d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner * + * Polling support by Suren Baghdasaryan + * Copyright (c) 2018 Google, Inc. 
+ * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include #include #include +#include #include #include #include +#include +#include +#include #include #include "sched.h" @@ -156,6 +163,11 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; @@ -176,6 +188,17 @@ static void group_init(struct psi_group *group) group->avg_next_update = sched_clock() + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,7 +233,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times, +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); @@ -245,8 +269,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times, if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; if (delta) @@ -274,7 +298,9 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; @@ -295,7 +321,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) u32 nonidle; u32 cpu_changed_states; - get_recent_times(group, cpu, times, + get_recent_times(group, cpu, aggregator, times, &cpu_changed_states); changed_states |= cpu_changed_states; @@ -320,7 +346,8 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); if (pchanged_states) *pchanged_states = changed_states; @@ -352,7 +379,7 @@ static u64 update_averages(struct psi_group *group, u64 now) for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->avg_total[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded 
time deltas can slip into the next period, @@ -394,7 +421,7 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - collect_percpu_times(group, &changed_states); + collect_percpu_times(group, PSI_AVGS, &changed_states); nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu @@ -414,6 +441,187 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. 
+ */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); + + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. 
+ */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; + } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -460,8 +668,8 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; @@ -507,6 +715,8 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -567,7 +777,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } @@ -668,6 +882,8 @@ void psi_cgroup_free(struct cgroup *cgroup) cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -731,7 +947,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group, NULL); + collect_percpu_times(group, PSI_AVGS, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); @@ -743,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? 
"full" : "some", @@ -786,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. 
+ */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t 
psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) -- cgit v1.2.3 From df5ba5be7425e1df296d40c5f37a39d98ec666a2 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Tue, 14 May 2019 15:41:18 -0700 Subject: kernel/sched/psi.c: expose pressure metrics on root cgroup Pressure metrics are already recorded and exposed in procfs for the entire system, but any tool which monitors cgroup pressure has to special case the root cgroup to read from procfs. This patch exposes the already recorded pressure metrics on the root cgroup. Link: http://lkml.kernel.org/r/20190510174938.3361741-1-dschatzberg@fb.com Signed-off-by: Dan Schatzberg Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 18 ++++++++++++------ kernel/sched/psi.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1140357d46f4..217cec4e22c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3540,15 +3540,24 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? 
&psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, @@ -4801,7 +4810,6 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4809,7 +4817,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4817,7 +4824,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e88918e0bb6d..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -173,7 +173,7 @@ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -- cgit v1.2.3 From 831246570d34692e0550da952d0655bdcc985419 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 14 May 2019 15:42:28 -0700 Subject: kernel/notifier.c: double register detection By design, notifiers can be registered only once; a second registration attempt, made by mistake, silently corrupts the notifier list. A few years ago I investigated this problem after a host was power cycled because of notifier list corruption. I prepared this patch, applied it to the OpenVZ kernel, and sent it out, but nobody commented on it. Later it helped us to detect a similar problem in the OpenVZ kernel. Mistakes with notifier registration can happen, for example, during subsystem initialization from different namespaces, or because of a lost unregister in the roll-back path on initialization failures. The proposed check cannot prevent the described problem, but it allows us to detect its cause quickly without coredump analysis. Link: http://lkml.kernel.org/r/04127e71-4782-9bbb-fe5a-7c01e93a99b0@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..bfc95b3e4235 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); -- cgit v1.2.3
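For illustration, a minimal userspace model of why double registration corrupts the chain (hypothetical standalone code mirroring the logic above, not part of the patch): inserting the same node twice makes it point to itself, so every later traversal spins forever.

    #include <stdio.h>

    struct node { struct node *next; int priority; };

    static void chain_register(struct node **nl, struct node *n)
    {
    	while (*nl != NULL) {
    		/* the patch adds its WARN_ONCE() at this point */
    		if (n->priority > (*nl)->priority)
    			break;
    		nl = &(*nl)->next;
    	}
    	n->next = *nl;
    	*nl = n;
    }

    int main(void)
    {
    	struct node a = { .next = NULL, .priority = 0 };
    	struct node *head = NULL;

    	chain_register(&head, &a);
    	chain_register(&head, &a);		/* the mistake */
    	printf("self-loop: %d\n", a.next == &a);	/* prints 1 */
    	return 0;
    }

The second call walks past the already-linked node and relinks it behind itself, which is exactly the corruption the new WARN_ONCE() flags before it happens.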
From 0cc75888dad112395df0ac755915ceb79811831c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:31 -0700 Subject: kernel/latencytop.c: remove unnecessary checks for latencytop_enabled 1. In the latencytop source, the only call chain is: account_scheduler_latency(struct task_struct *task, int usecs, int inter) { if (unlikely(latencytop_enabled)) /* the outermost check */ __account_scheduler_latency(task, usecs, inter); } __account_scheduler_latency account_global_scheduler_latency if (!latencytop_enabled) So the inner check for latencytop_enabled is not necessary at all. 2. In clear_all_latency_tracing (now renamed to clear_tsk_latency_tracing), the check for latencytop_enabled is redundant and, to some extent, buggy. There is no reason to refuse to clear /proc/$pid/latency when latencytop_enabled is set to 0: if we disable latencytop manually via echo 0 > /proc/sys/kernel/latencytop and then try to clear /proc/$pid/latency, the clearing fails. Its sibling function clear_global_latency_tracing also has no such check. Notes: These changes are only visible to users who set CONFIG_LATENCYTOP and won't change the user tool latencytop's behavior. Link: http://lkml.kernel.org/r/20190226114602.16902-2-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..bbde5614da71 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -71,9 +71,6 @@ void clear_all_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; -- cgit v1.2.3 From e02c9b0d65a7493180db45320f82482c6ba8ea57 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:34 -0700 Subject: kernel/latencytop.c: rename clear_all_latency_tracing to clear_tsk_latency_tracing The name clear_all_latency_tracing is misleading: in fact it only clears a task's latency_record[], and we already have another function, clear_global_latency_tracing, which clears the global latency_record[] buffer. Link: http://lkml.kernel.org/r/20190226114602.16902-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/latencytop.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b409e792aadc..b4cba953040a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2093,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index bbde5614da71..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,7 +67,7 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; -- cgit v1.2.3 From 6c4e121fda519e0da5c6755a60fdef8cd39634ae Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 14 May 2019 15:42:37 -0700 Subject: kernel/user.c: clean up some leftover code The out_unlock label is misleading; no unlocking happens after it, so just return NULL directly. Also, nothing between the kmem_cache_zalloc() that creates new and the two key_put() can initialize new->uid_keyring or new->session_keyring, so those calls are no-ops.
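Why those calls are dead follows from the optimistic allocate-then-recheck pattern alloc_uid() uses, sketched below in simplified form (refcounting and the hash-bucket computation are trimmed; hashent stands in for the computed bucket; this is an illustration, not the verbatim kernel code):

    struct user_struct *alloc_uid_sketch(kuid_t uid)
    {
    	struct user_struct *up, *new;

    	/* fast path: look up under the lock */
    	spin_lock_irq(&uidhash_lock);
    	up = uid_hash_find(uid, hashent);
    	spin_unlock_irq(&uidhash_lock);
    	if (up)
    		return up;

    	/* slow path: allocate outside the lock... */
    	new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);	/* zeroed */
    	if (!new)
    		return NULL;
    	new->uid = uid;

    	/* ...then recheck, since another task may have raced us */
    	spin_lock_irq(&uidhash_lock);
    	up = uid_hash_find(uid, hashent);
    	if (up)
    		kmem_cache_free(uid_cachep, new);	/* new is still all-zero */
    	else {
    		uid_hash_insert(new, hashent);
    		up = new;
    	}
    	spin_unlock_irq(&uidhash_lock);
    	return up;
    }

Between the zeroing allocation and the duplicate check, no code path touches new->uid_keyring or new->session_keyring, so the removed key_put() calls were always handed NULL.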
Link: http://lkml.kernel.org/r/20190424200404.9114-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Andrew Morton Cc: "Peter Zijlstra (Intel)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..88b834f0eebc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) -- cgit v1.2.3 From b028fb612849add771679c1b99a50d99264c9632 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 14 May 2019 15:44:35 -0700 Subject: kernel/signal.c: annotate implicit fall through There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit removes the following warning: kernel/signal.c:795:13: warning: this statement may fall through [-Wimplicit-fallthrough=] Link: http://lkml.kernel.org/r/20190114203505.17875-1-malat@debian.org Signed-off-by: Mathieu Malaterre Acked-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 62f9aea4a15a..c4dd66436fc5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } -- cgit v1.2.3 From 475dae385497dde3f25271ce77b526a1e54a472a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 May 2019 15:44:52 -0700 Subject: kernel/sysctl.c: switch to bitmap_zalloc() Switch to bitmap_zalloc() to show clearly what we are allocating. Besides that, it returns a pointer of bitmap type instead of an opaque void *.
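The change itself is mechanical; a before/after sketch of the allocation and its matching free (illustrative fragment, with bitmap_len as in the function being patched):

    /* before: sizing arithmetic obscures the intent */
    tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len),
    		     sizeof(unsigned long), GFP_KERNEL);
    /* ... use tmp_bitmap ... */
    kfree(tmp_bitmap);

    /* after: clearly a bitmap of bitmap_len bits, typed unsigned long * */
    tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
    /* ... use tmp_bitmap ... */
    bitmap_free(tmp_bitmap);

bitmap_zalloc()/bitmap_free() are thin wrappers around the same kmalloc-based allocation, so runtime behavior is unchanged; only the intent and the return type improve.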
Link: http://lkml.kernel.org/r/20190304094037.57756-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba158f61aab4..d82f9161adb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3178,9 +3178,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3271,7 +3269,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } -- cgit v1.2.3 From e260ad01f0aa9e96b5386d5cd7184afd949dc457 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 14 May 2019 15:44:55 -0700 Subject: sysctl: return -EINVAL if val violates minmax Currently, when userspace gives us a value that overflows e.g. file-max or other callers of __do_proc_doulongvec_minmax(), we simply ignore the new value and leave the current value untouched. This can be problematic as it gives the illusion that the limit has indeed been bumped when in fact it failed. This commit makes sure to return EINVAL when an overflow is detected. Please note that this is a userspace facing change. Link: http://lkml.kernel.org/r/20190210203943.8227-4-christian@brauner.io Signed-off-by: Christian Brauner Acked-by: Luis Chamberlain Cc: Kees Cook Cc: Alexey Dobriyan Cc: Al Viro Cc: Dominik Brodowski Cc: "Eric W. Biederman" Cc: Joe Lawrence Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d82f9161adb8..f7bd1aead3bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2886,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; -- cgit v1.2.3 From 3116ad38f51c98c81175151bd7358858a92a6031 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 14 May 2019 15:45:13 -0700 Subject: kernel/sysctl.c: fix proc_do_large_bitmap for large input buffers Today, proc_do_large_bitmap() truncates a large write input buffer to PAGE_SIZE - 1, which may result in misparsed numbers at the (truncated) end of the buffer. Further, it fails to notify the caller that the buffer was truncated, so it doesn't get called iteratively to finish the entire input buffer. Tell the caller if there's more work to do by adding the skipped amount back to left/*lenp before returning. To fix the misparsing, reset the position if we have completely consumed a truncated buffer (or if just one char is left, which may be a "-" in a range), and ask the caller to come back for more.
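To see why rewinding matters, consider a parser fed a buffer cut off mid-number; a small standalone sketch of the resume logic (a hypothetical helper, not the kernel code; it assumes the buffer is NUL-terminated at buf+len, as memdup_user_nul() guarantees in the kernel path):

    #include <stdlib.h>

    /*
     * Consume as many complete comma-separated numbers from buf as
     * possible. If the buffer was truncated, a number touching the end
     * may continue in the next chunk, so stop before it and report how
     * many bytes were safely consumed; the caller resubmits the rest.
     */
    static size_t parse_chunk(const char *buf, size_t len, int truncated)
    {
    	const char *p = buf, *end = buf + len;

    	while (p < end) {
    		char *num_end;
    		long val = strtol(p, &num_end, 10);

    		if (num_end == p)
    			break;		/* not a number: stop (error handling elided) */
    		if (truncated && num_end == end)
    			break;		/* possibly cut off: rewind here */
    		(void)val;		/* a real parser would record val */
    		p = num_end;
    		if (p < end && *p == ',')
    			p++;
    	}
    	return p - buf;
    }

Without the rewind, a trailing "1024" truncated after "10" would be recorded as bits 10 and 24 instead of bit 1024, which is exactly the misparsing the patch fixes.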
Link: http://lkml.kernel.org/r/20190320222831.8243-7-mcgrof@kernel.org Signed-off-by: Eric Sandeen Signed-off-by: Luis Chamberlain Acked-by: Kees Cook Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f7bd1aead3bf..943c89178e3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3172,9 +3172,13 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) @@ -3189,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3209,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3227,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; -- cgit v1.2.3 From 1fd402df4586bcc239298081449ce58a78211626 Mon Sep 17 00:00:00 2001 From: Timmy Li Date: Tue, 14 May 2019 15:45:16 -0700 Subject: kernel/pid.c: remove unneeded hash header file Hash functions are not needed since idr is used now. Let's remove hash header file for cleanup. Link: http://lkml.kernel.org/r/20190430053319.95913-1-scuttimmy@gmail.com Signed-off-by: Timmy Li Cc: "Eric W. Biederman" Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleg Nesterov Cc: Mike Rapoport Cc: KJ Tsanaktsidis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..89548d35eefb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 826eba0d77bc74c4d1c611374b76abfe251e8538 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 14 May 2019 15:45:25 -0700 Subject: gcov: clang: move common GCC code into gcc_base.c Patch series "gcov: add Clang support", v4. This patch (of 3): base.c contains a few callbacks specific to GCC's gcov implementation. Move these into their own module in preparation for Clang support. 
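For context, the GCC-specific part of base.c exists because gcc's -fprofile-arcs emits, for every instrumented object file, a constructor that hands that file's profiling record to the kernel. Conceptually it looks like this (a sketch of compiler-generated code; the names are illustrative and the exact mechanics are compiler-internal):

    /* emitted per object file by gcc, run via CONFIG_CONSTRUCTORS */
    static struct gcov_info this_object_info;	/* counters, checksums, ... */

    static void __attribute__((constructor)) gcov_ctor(void)
    {
    	__gcov_init(&this_object_info);
    }

Clang's instrumentation registers its data through a different entry point (llvm_gcov_init() with writeout callbacks, added later in this series), which is why the __gcov_init() plumbing moves into its own gcc_base.c module here.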
Link: http://lkml.kernel.org/r/20190318025411.98014-2-trong@android.com Signed-off-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Tri Vo Tested-by: Trilok Soni Tested-by: Prasad Sodagudi Tested-by: Tri Vo Reviewed-by: Peter Oberparleiter Cc: Daniel Mentz Cc: Petri Gynther Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Makefile | 4 +-- kernel/gcov/base.c | 84 ++---------------------------------------------- kernel/gcov/gcc_base.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcov.h | 3 ++ 4 files changed, 93 insertions(+), 84 deletions(-) create mode 100644 kernel/gcov/gcc_base.c (limited to 'kernel') diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..45431ed679d1 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,5 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..799d42072727 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. 
*/ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..0ecf1d664ec3 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -83,4 +83,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ -- cgit v1.2.3 From e178a5beb36960902379040ee0b667fb0a8eee93 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 14 May 2019 15:45:31 -0700 Subject: gcov: clang support LLVM uses profiling data that's deliberately similar to GCC, but has a very different way of exporting that data. LLVM calls llvm_gcov_init() once per module, and provides a couple of callbacks that we can use to ask for more data. We care about the "writeout" callback, which in turn calls back into compiler-rt/this module to dump all the gathered coverage data to disk: llvm_gcda_start_file() llvm_gcda_emit_function() llvm_gcda_emit_arcs() llvm_gcda_emit_function() llvm_gcda_emit_arcs() [... repeats for each function ...] llvm_gcda_summary_info() llvm_gcda_end_file() This design is much more stateless and unstructured than gcc's, and is intended to run at process exit. This forces us to keep some local state about which module we're dealing with at the moment. 
On the other hand, it also means we don't depend as much on how LLVM represents profiling data internally. See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more details on how this works, particularly GCOVProfiler::emitProfileArcs(), GCOVProfiler::insertCounterWriteout(), and GCOVProfiler::insertFlush(). [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190417225328.208129-1-trong@android.com Signed-off-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Tri Vo Co-developed-by: Nick Desaulniers Co-developed-by: Tri Vo Tested-by: Trilok Soni Tested-by: Prasad Sodagudi Tested-by: Tri Vo Tested-by: Daniel Mentz Tested-by: Petri Gynther Reviewed-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 3 +- kernel/gcov/Makefile | 1 + kernel/gcov/base.c | 2 +- kernel/gcov/clang.c | 581 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcc_3_4.c | 12 ++ kernel/gcov/gcc_4_7.c | 12 ++ kernel/gcov/gcov.h | 2 + 7 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 kernel/gcov/clang.c (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..f71c1adcff31 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +63,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 45431ed679d1..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -4,3 +4,4 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 799d42072727..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -64,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(¤t_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(¤t_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void 
llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. 
+ */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. 
+ */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. 
*/ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. 
+ */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 0ecf1d664ec3..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include #include /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { -- cgit v1.2.3 From c39ea0b9dd24bf1bf2baa5cdbfa1905f3065347b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 14 May 2019 15:45:34 -0700 Subject: panic: avoid the extra noise dmesg When a kernel panic happens, it will first print the panic call stack, then the ending message, like: [ 35.743249] ---[ end Kernel panic - not syncing: Fatal exception [ 35.749975] ------------[ cut here ]------------ The above messages are very useful for debugging. But if the system is configured to not reboot on panic, say the "panic_timeout" parameter equals 0, it will likely print out many noisy messages, like a WARN() call stack for each and every CPU except the panicking one, e.g.: WARNING: CPU: 1 PID: 280 at kernel/sched/core.c:1198 set_task_cpu+0x183/0x190 Call Trace: try_to_wake_up default_wake_function autoremove_wake_function __wake_up_common __wake_up_common_lock __wake_up wake_up_klogd_work_func irq_work_run_list irq_work_tick update_process_times tick_sched_timer __hrtimer_run_queues hrtimer_interrupt smp_apic_timer_interrupt apic_timer_interrupt For people working in console mode, the screen will first show the panic call stack, but it is immediately overwritten by these noisy extra messages, which makes debugging much more difficult, as the original context gets lost on screen. These noisy messages also confuse some users; I have seen many bug reporters post the noisy messages to bugzilla instead of the real panic call stack and context. Add a flag, "suppress_printk", which gets set in panic() to avoid those noisy messages, without changing the current behavior that both panic blinking and the sysrq magic key keep working, as suggested by Petr Mladek. To verify this, make sure the kernel is not configured to reboot on panic, run # echo c > /proc/sysrq-trigger on the console, and check that the console prints only the panic call stack.
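The mechanism is deliberately simple; a standalone model of the flag's effect (illustrative userspace code, not the patch itself):

    #include <stdio.h>

    static int suppress_logging;	/* plays the role of suppress_printk */

    static int log_msg(const char *msg)
    {
    	if (suppress_logging)
    		return 0;	/* late, unimportant messages are dropped */
    	return printf("%s\n", msg);
    }

    int main(void)
    {
    	log_msg("---[ end Kernel panic - not syncing ]---");
    	suppress_logging = 1;	/* set at the end of panic() */
    	log_msg("WARNING: stray wakeup on another CPU");	/* never shown */
    	return 0;
    }

The real check sits at the top of vprintk_emit(), as the diff below shows, so the panic backtrace already on the console can no longer be scrolled away.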
Link: http://lkml.kernel.org/r/1551430186-24169-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Kees Cook Cc: Borislav Petkov Cc: Andi Kleen Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 +++ kernel/printk/printk.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..a6145050a8da 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -321,6 +321,9 @@ void panic(const char *fmt, ...) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..17102fd4c136 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; -- cgit v1.2.3 From b287a25a7148a89d977c819c1f7d6584f875b682 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 14 May 2019 15:45:37 -0700 Subject: panic/reboot: allow specifying reboot_mode for panic only Allow specifying reboot_mode for panic only. This is needed on systems where ramoops is used to store panic logs, and user wants to use warm reset to preserve those, while still having cold reset on normal reboots. Link: http://lkml.kernel.org/r/20190322004735.27702-1-aaro.koskinen@iki.fi Signed-off-by: Aaro Koskinen Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ kernel/reboot.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a6145050a8da..8779d64bace0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -306,6 +306,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. 
*/ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..b9e79e8c7226 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +529,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +563,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': -- cgit v1.2.3 From 89963adcdb430e047f4c03ac3ed6ce9aa42a595c Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 15 May 2019 15:23:52 +1000 Subject: kernel/compat.c: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch aims to suppress 3 missing-break-in-switch false positives on some architectures. Acked-by: Arnd Bergmann Cc: Deepa Dinamani Cc: Gustavo A. R. Silva Cc: Kees Cook Cc: Jann Horn Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/compat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d8a36c6ad7c9..b5f7063c0db6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -346,8 +346,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + /* fall through */ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + /* fall through */ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + /* fall through */ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else -- cgit v1.2.3 From 05b289263772b0698589abc47771264a685cd365 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 16 May 2019 10:38:21 -0700 Subject: signal: unconditionally leave the frozen state in ptrace_stop() Alex Xu reported a regression in strace, caused by the introduction of the cgroup v2 freezer. The regression can be reproduced by stracing the following simple program: #include int main() { write(1, "a", 1); return 0; } An attempt to run strace ./a.out leads to the infinite loop: [ pre-main omitted ] write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? 
ERESTARTSYS (To be restarted if SA_RESTART is set) [ repeats forever ] The problem occurs because the traced task leaves ptrace_stop() (and the signal handling loop) with the frozen bit set. So let's call cgroup_leave_frozen(true) unconditionally after sleeping in ptrace_stop(). With this patch applied, strace works as expected: [ pre-main omitted ] write(1, "a", 1) = 1 exit_group(0) = ? +++ exited with 0 +++ Reported-by: Alex Xu Fixes: 76f969e8948d ("cgroup: cgroup v2 freezer") Signed-off-by: Roman Gushchin Acked-by: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c4dd66436fc5..a1eb44dc9ff5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2113,6 +2113,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_enable_no_resched(); cgroup_enter_frozen(); freezable_schedule(); + cgroup_leave_frozen(true); } else { /* * By the time we got the lock, our tracer went away. -- cgit v1.2.3 From e547ff3f803e779a3898f1f48447b29f43c54085 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Tue, 14 May 2019 19:42:57 -0700 Subject: bpf: relax inode permission check for retrieving bpf program For the iptable module to load a bpf program from a pinned location, it only retrieves a loaded program and cannot change the program content, so requiring write permission for it might not be necessary. Also, when adding or removing an unrelated iptable rule, it might need to flush and reload the xt_bpf related rules as well, which triggers the inode permission check. It might be better to remove the write permission check for the inode so we won't need to grant write access to all the processes that flush and restore iptables rules. Signed-off-by: Chenbo Feng Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bc53e5b20ddc..84a80b02db99 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -518,7 +518,7 @@ out: static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + int ret = inode_permission(inode, MAY_READ); if (ret) return ERR_PTR(ret); -- cgit v1.2.3 From de6da1e8bcf0dd2058b950b127491821207679dc Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 17 May 2019 14:31:50 -0700 Subject: panic: add an option to replay all the printk messages in the buffer Currently on panic, the kernel lowers the loglevel and prints out only the pending printk messages via console_flush_on_panic(). Add an option for users to configure "panic_print" to replay all dmesg in the buffer, some of which they may never have seen due to the loglevel setting; this will help panic debugging.
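For reference, the new mode is bit 5 of the existing panic_print bitmask (PANIC_PRINT_ALL_PRINTK_MSG, 0x00000020, defined in the diff below). Assuming the existing kernel.panic_print sysctl and panic_print= boot parameter simply cover the new bit, it can presumably be enabled at runtime with # echo 0x20 > /proc/sys/kernel/panic_print or at boot with panic_print=0x20 on the kernel command line.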
[feng.tang@intel.com: keep the original console_flush_on_panic() inside panic()] Link: http://lkml.kernel.org/r/1556199137-14163-1-git-send-email-feng.tang@intel.com [feng.tang@intel.com: use logbuf lock to protect the console log index] Link: http://lkml.kernel.org/r/1556269868-22654-1-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1556095872-36838-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Petr Mladek Cc: Aaro Koskinen Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Kees Cook Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- kernel/printk/printk.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8779d64bace0..b4543a31a495 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(void) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -277,7 +281,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 17102fd4c136..a6e06fe38e41 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2535,10 +2535,11 @@ void console_unblank(void) /** * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail @@ -2549,6 +2550,15 @@ void console_flush_on_panic(void) */ console_trylock(); console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { + unsigned long flags; + + logbuf_lock_irqsave(flags); + console_seq = log_first_seq; + console_idx = log_first_idx; + logbuf_unlock_irqrestore(flags); + } console_unlock(); } -- cgit v1.2.3 From 7eaf51a2e094229b75cc0c315f1cbbe2f3960058 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 May 2019 14:51:17 -0400 Subject: stacktrace: Unbreak stack_trace_save_tsk_reliable() Miroslav reported that the livepatch self-tests were failing, specifically a case in which the consistency model ensures that a current executing function is not allowed to be patched, "TEST: busy target module". Recent renovations of stack_trace_save_tsk_reliable() left it returning only an -ERRNO success indication in some configuration combinations: klp_check_stack() ret = stack_trace_save_tsk_reliable() #ifdef CONFIG_ARCH_STACKWALK && CONFIG_HAVE_RELIABLE_STACKTRACE stack_trace_save_tsk_reliable() ret = arch_stack_walk_reliable() return 0 return -EINVAL ... return ret; ... 
if (ret < 0) /* stack_trace_save_tsk_reliable error */ nr_entries = ret; << 0 Previously (and currently for !CONFIG_ARCH_STACKWALK && CONFIG_HAVE_RELIABLE_STACKTRACE) stack_trace_save_tsk_reliable() returned the number of entries that it consumed in the passed storage array. In the case of the above config and trace, be sure to return the stacktrace_cookie.len on stack_trace_save_tsk_reliable() success. Fixes: 25e39e32b0a3f ("livepatch: Simplify stack trace retrieval") Reported-by: Miroslav Benes Signed-off-by: Joe Lawrence Signed-off-by: Thomas Gleixner Reviewed-by: Kamalesh Babulal Acked-by: Josh Poimboeuf Cc: live-patching@vger.kernel.org Cc: jikos@kernel.org Cc: pmladek@suse.com Link: https://lkml.kernel.org/r/20190517185117.24642-1-joe.lawrence@redhat.com --- kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..90d3e0bf0302 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -206,7 +206,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, ret = arch_stack_walk_reliable(consume_entry, &c, tsk); put_task_stack(tsk); - return ret; + return ret ? ret : c.len; } #endif -- cgit v1.2.3 From 457c89965399115e5cd8bf38f9c597293405703d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 13:08:55 +0100 Subject: treewide: Add SPDX license identifier for missed files Add SPDX license identifiers to all files which: - Have no license information of any form - Have EXPORT_.*_SYMBOL_GPL inside which was used in the initial scan/conversion to ignore the file These files fall under the project license, GPL v2 only. The resulting SPDX license identifier is: GPL-2.0-only Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/tnum.c | 1 + kernel/cgroup/cgroup-v1.c | 1 + kernel/cgroup/rstat.c | 1 + kernel/context_tracking.c | 1 + kernel/crash_dump.c | 1 + kernel/dma/swiotlb.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/freezer.c | 1 + kernel/hung_task.c | 1 + kernel/irq_work.c | 1 + kernel/jump_label.c | 1 + kernel/kallsyms.c | 1 + kernel/kthread.c | 1 + kernel/locking/lockdep.c | 1 + kernel/locking/mutex.c | 1 + kernel/locking/percpu-rwsem.c | 1 + kernel/locking/rtmutex.c | 1 + kernel/notifier.c | 1 + kernel/panic.c | 1 + kernel/pid.c | 1 + kernel/pid_namespace.c | 1 + kernel/power/qos.c | 1 + kernel/printk/printk.c | 1 + kernel/profile.c | 1 + kernel/ptrace.c | 1 + kernel/reboot.c | 1 + kernel/resource.c | 1 + kernel/sched/clock.c | 1 + kernel/sched/core.c | 1 + kernel/sched/cputime.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/isolation.c | 1 + kernel/sched/wait.c | 1 + kernel/sched/wait_bit.c | 1 + kernel/signal.c | 1 + kernel/smp.c | 1 + kernel/smpboot.c | 1 + kernel/stacktrace.c | 1 + kernel/sysctl.c | 1 + kernel/umh.c | 1 + kernel/up.c | 1 + kernel/user-return-notifier.c | 1 + kernel/user.c | 1 + kernel/workqueue.c | 1 + 45 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..ca52b9642943 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. 
Each bit can be either diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 68ca5de7ec27..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bb95a35e8c2d..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9ad37b9e44a7..be01a4d627c9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Context tracking: Probe on high level context boundaries such as kernel * and userspace. This includes syscalls and exceptions entry/exit. diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index b64e238b553b..9c23ae074b40 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6f7619c1f877..13f0cb080a4d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * diff --git a/kernel/exit.c b/kernel/exit.c index 8361a560cd1d..1803efb2922f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * diff --git a/kernel/fork.c b/kernel/fork.c index b4cba953040a..b2b87d450b80 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * diff --git a/kernel/freezer.c b/kernel/freezer.c index b162b74611e4..c0738424bb43 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f108a95882c6..14a625c16cb3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Detect Hung Task * diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 73288914ed5e..d42acaf81886 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * diff --git a/kernel/jump_label.c b/kernel/jump_label.c index de6efdecc70d..0bfa10f4410c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * jump label support * diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 14934afa9e68..95a260f9214b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * diff --git a/kernel/kthread.c b/kernel/kthread.c index be4e8795561a..621467c33fef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. 
* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d06190fa5082..c47788fa85f9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/lockdep.c * diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db578783dd36..0c601ae072b3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/locking/mutex.c * diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..b6a9cc62099a 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 978d63a8261c..38fbf9fa7f1b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RT-Mutexes: simple blocking mutual exclusion locks with PI support * diff --git a/kernel/notifier.c b/kernel/notifier.c index bfc95b3e4235..d9f5081d578d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index b4543a31a495..4d9f55bf7d38 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/panic.c * diff --git a/kernel/pid.c b/kernel/pid.c index 89548d35eefb..e5cad0c7d5dd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..f54bc7cb6c2d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d22131afc1e..33e3febaba53 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a6e06fe38e41..1888f6a3b694 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * diff --git a/kernel/profile.c b/kernel/profile.c index 9c08a2c7cb1d..af7c94bf5fa1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. 
Manages a direct-mapped profile hit count buffer, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..5710d07e67cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/ptrace.c * diff --git a/kernel/reboot.c b/kernel/reboot.c index b9e79e8c7226..c4d472b7f1b4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/reboot.c * diff --git a/kernel/resource.c b/kernel/resource.c index 8c15f846e8ef..158f04ec1d4f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/resource.c * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e3e3b979f9bd..1152259a4ca0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sched_clock() for unstable CPU clocks * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 102dfcf0a29a..874c427742a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/core.c * diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba4a143bdcf3..2305ce89a26c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple CPU accounting cgroup controller */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5516bae0c1b..80940939b733 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic entry points for the idle threads and * implementation of the idle task scheduling class. diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 687302051a27..123ea07a3f3b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Housekeeping management. Manage the targets for routine code that can run on * any CPU: unbound workqueues, timers, kthreads and any offloadable work. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6eb1f8efd221..fa0f9adfb752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic waiting primitives. 
* diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c67c6d24adc2..45eba18a2898 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/signal.c b/kernel/signal.c index a1eb44dc9ff5..d7b9d14ac80d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * diff --git a/kernel/smp.c b/kernel/smp.c index f4cf1b0bb3b8..d155374632eb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c230c2dd48e1..2efe1e206167 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Common SMP CPU bringup/teardown functions */ diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..5667f1da3ede 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/stacktrace.c * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 943c89178e3d..7d1008be6173 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface * diff --git a/kernel/umh.c b/kernel/umh.c index d937cbad903a..7f255b5a8845 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ diff --git a/kernel/up.c b/kernel/up.c index ff536f9cc8a2..483c9962c999 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Uniprocessor-only support functions. The counterpart to kernel/smp.c */ diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 9586b670a5b2..870ecd7c63ed 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/user.c b/kernel/user.c index 88b834f0eebc..78b17e36e705 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9657315405de..95aea04ff722 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * -- cgit v1.2.3 From ec8f24b7faaf3d4799a7c3f4c1b87f6b02778ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 13:07:45 +0100 Subject: treewide: Add SPDX license identifier - Makefile/Kconfig Add SPDX license identifiers to all Make/Kconfig files which: - Have no license information of any form These files fall under the project license, GPL v2 only. 
The resulting SPDX license identifier is: GPL-2.0-only Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/Kconfig.freezer | 1 + kernel/Kconfig.hz | 1 + kernel/Kconfig.locks | 1 + kernel/Kconfig.preempt | 1 + kernel/debug/Makefile | 1 + kernel/dma/Kconfig | 1 + kernel/gcov/Kconfig | 1 + kernel/irq/Kconfig | 1 + kernel/livepatch/Kconfig | 1 + kernel/livepatch/Makefile | 1 + kernel/power/Kconfig | 1 + kernel/printk/Makefile | 1 + kernel/rcu/Kconfig | 1 + kernel/rcu/Kconfig.debug | 1 + kernel/time/Kconfig | 1 + kernel/trace/Kconfig | 1 + 16 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer index a3bb4cb52539..68646feefb3d 100644 --- a/kernel/Kconfig.freezer +++ b/kernel/Kconfig.freezer @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only config FREEZER def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a846757..38ef6d06888e 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer Interrupt Frequency Configuration # diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index bf770d7556f7..e0852dc333ac 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # The ARCH_INLINE foo is necessary because select ignores "depends on" # diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0fee5fe6c899..dc0b682ec2d9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only choice prompt "Preemption Model" diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index a85edc339985..332ee6c6ec2c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the linux kernel debugger # diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 83d711f8d665..70f8f8d9200e 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAS_DMA bool diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index f71c1adcff31..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "GCOV-based kernel profiling" config GCOV_KERNEL diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8fee06625c37..f92d9a687372 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "IRQ subsystem" # Options selectable by the architecture code diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index ec4565122e65..54102deb50ba 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAVE_LIVEPATCH bool help diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index b36ceda6488e..cf9b5bcdb952 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9bbaaab14b36..ff8592ddedee 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE diff --git 
a/kernel/printk/Makefile b/kernel/printk/Makefile index 4a2ffc39eb95..4d052fc6bcde 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 37301430970e..480edf328b51 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related configuration options # diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..5ec3ea4028e2 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related debugging configuration options # diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e2c038d6c13c..fcc42353f125 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer subsystem related configuration options # diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d965cef6c77..564e5fdb025f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Architectures that offer an FUNCTION_TRACER implementation should # select HAVE_FUNCTION_TRACER: -- cgit v1.2.3 From d6cd1e9b9ff4f5e2d6b7085ad8601f86177fd300 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:39 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 9 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not you can access it online at http www gnu org licenses gpl 2 0 html extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Steve Winslow Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154041.430943677@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/locking/test-ww_mutex.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 65a3b7e55b9f..3e82f449b4ff 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Module-based API test facility for ww_mutexes - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. */ #include -- cgit v1.2.3 From 1ccea77e2a2687cae171b7987eb44730ec8c6d5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:43 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 13 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not see http www gnu org licenses this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details [based] [from] [clk] [highbank] [c] you should have received a copy of the gnu general public license along with this program if not see http www gnu org licenses extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 355 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Steve Winslow Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154041.837383322@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/livepatch/core.c | 14 +------------- kernel/livepatch/patch.c | 14 +------------- kernel/livepatch/shadow.c | 14 +------------- kernel/livepatch/transition.c | 14 +------------- kernel/printk/internal.h | 14 +------------- kernel/printk/printk_safe.c | 14 +------------- 6 files changed, 6 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 91cd519756d3..2398832947c6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * core.c - Kernel Live Patching Core * * Copyright (C) 2014 Seth Jennings * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 99cb3ad05eb4..bd43537702bd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * patch.c - livepatch patching functions * * Copyright (C) 2014 Seth Jennings * Copyright (C) 2014 SUSE * Copyright (C) 2015 Josh Poimboeuf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 83958c814439..e5c9fb295ba9 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * shadow.c - Shadow Variables * * Copyright (C) 2014 Josh Poimboeuf * Copyright (C) 2014 Seth Jennings * Copyright (C) 2017 Joe Lawrence - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ /** diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index c53370d596be..abb2a4a2cbb2 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * transition.c - Kernel Live Patching transition functions * * Copyright (C) 2015-2016 Josh Poimboeuf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 0f1898820cba..c8e6ab689d42 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * internal.h - printk internal definitions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 0913b4d385de..b4045e782743 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include -- cgit v1.2.3 From 7170066ecd289cd8560695b6f86ba8dc723b6505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:55 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 25 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it would be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 6 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Steve Winslow Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154043.007767574@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/delayacct.c | 11 +---------- kernel/test_kprobes.c | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2a12b988c717..27725754ac99 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* delayacct.c - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 
2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #include diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 7bca480151b0..76c997fdbc9d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * test_kprobes.c - simple sanity test for *probes * * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define pr_fmt(fmt) "Kprobe smoke test: " fmt -- cgit v1.2.3 From 55267c88c003a3648567beae7c90512d3e2ab15e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:50 -0500 Subject: tracing: Prevent hist_field_var_ref() from accessing NULL tracing_map_elts hist_field_var_ref() is an implementation of hist_field_fn_t(), which can be called with a null tracing_map_elt elt param when assembling a key in event_hist_trigger(). In the case of hist_field_var_ref() this doesn't make sense, because a variable can only be resolved by looking it up using an already assembled key i.e. a variable can't be used to assemble a key since the key is required in order to access the variable. Upper layers should prevent the user from constructing a key using a variable in the first place, but in case one slips through, it shouldn't cause a NULL pointer dereference. Also if one does slip through, we want to know about it, so emit a one-time warning in that case. Link: http://lkml.kernel.org/r/64ec8dc15c14d305295b64cdfcc6b2b9dd14753f.1555597045.git.tom.zanussi@linux.intel.com Reported-by: Vincent Bernat Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7fca3457c705..06e7b9f66de6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1854,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field, struct hist_elt_data *elt_data; u64 var_val = 0; + if (WARN_ON_ONCE(!elt)) + return var_val; + elt_data = elt->private_data; var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; -- cgit v1.2.3 From c8d94a1878342fdffedaaf15201d951dfc147065 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:51 -0500 Subject: tracing: Check keys for variable references in expressions too There's an existing check for variable references in keys, but it doesn't go far enough. 
It checks whether a key field is a variable reference but doesn't check whether it's an expression containing variable references, which can cause the same problems for callers. Use the existing field_has_hist_vars() function rather than a direct top-level flag check to catch all possible variable references. Link: http://lkml.kernel.org/r/e8c3d3d53db5ca90ceea5a46e5413103a6902fc7.1555597045.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reported-by: Vincent Bernat Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 06e7b9f66de6..2b76f9520bd0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -59,7 +59,7 @@ C(NO_CLOSING_PAREN, "No closing paren found"), \ C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ - C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ C(VAR_NOT_FOUND, "Couldn't find variable"), \ C(FIELD_NOT_FOUND, "Couldn't find field"), @@ -4506,7 +4506,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + if (field_has_hist_vars(hist_field, 0)) { hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; -- cgit v1.2.3 From 9b2ca371b1505a547217b244f903ad3fb86fa5b4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:52 -0500 Subject: tracing: Add a check_val() check before updating cond_snapshot() track_val Without this check a snapshot is taken whenever a bucket's max is hit, rather than only when the global max is hit, as it should be. Before: In this example, we do a first run of the workload (cyclictest), examine the output, note the max ('triggering value') (347), then do a second run and note the max again. In this case, the max in the second run (39) is below the max in the first run, but since we haven't cleared the histogram, the first max is still in the histogram and is higher than any other max, so it should still be the max for the snapshot. It isn't however - the value should still be 347 after the second run. # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio,prev_comm):onmax($wakeup_lat).snapshot() if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2143 } hitcount: 199 max: 44 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2145 } hitcount: 1325 max: 38 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2144 } hitcount: 1982 max: 347 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 Snapshot taken (see tracing/snapshot). 
Details: triggering value { onmax($wakeup_lat) }: 347 triggered by event with key: { next_pid: 2144 } # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2143 } hitcount: 199 max: 44 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2148 } hitcount: 199 max: 16 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 { next_pid: 2145 } hitcount: 1325 max: 38 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2150 } hitcount: 1326 max: 39 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2144 } hitcount: 1982 max: 347 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2149 } hitcount: 1983 max: 130 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/0 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 39 triggered by event with key: { next_pid: 2150 } After: In this example, we do a first run of the workload (cyclictest), examine the output, note the max ('triggering value') (375), then do a second run and note the max again. In this case, the max in the second run is still 375, the highest in any bucket, as it should be. # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2072 } hitcount: 200 max: 28 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 { next_pid: 2074 } hitcount: 1323 max: 375 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2073 } hitcount: 1980 max: 153 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 375 triggered by event with key: { next_pid: 2074 } # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2101 } hitcount: 199 max: 49 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2072 } hitcount: 200 max: 28 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 { next_pid: 2074 } hitcount: 1323 max: 375 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2103 } hitcount: 1325 max: 74 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2073 } hitcount: 1980 max: 153 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2102 } hitcount: 1981 max: 84 next_prio: 19 next_comm: cyclictest prev_pid: 12 prev_prio: 120 prev_comm: kworker/0:1 Snapshot taken (see tracing/snapshot). 
Details: triggering value { onmax($wakeup_lat) }: 375 triggered by event with key: { next_pid: 2074 } Link: http://lkml.kernel.org/r/95958351329f129c07504b4d1769c47a97b70d65.1555597045.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: a3785b7eca8fd ("tracing: Add hist trigger snapshot() action") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 2b76f9520bd0..ca6b0dff60c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3585,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) struct track_data *track_data = tr->cond_snapshot->cond_data; struct hist_elt_data *elt_data, *track_elt_data; struct snapshot_context *context = cond_data; + struct action_data *action; u64 track_val; if (!track_data) return false; + action = track_data->action_data; + track_val = get_track_val(track_data->hist_data, context->elt, track_data->action_data); + if (!action->track_data.check_val(track_data->track_val, track_val)) + return false; + track_data->track_val = track_val; memcpy(track_data->key, context->key, track_data->key_len); -- cgit v1.2.3 From 4eebe38a37f9397ffecd4bd3afbdf36838a97969 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Thu, 28 Mar 2019 03:49:46 +0530 Subject: kernel/trace/trace.h: Remove duplicate header of trace_seq.h Remove duplicate header which is included twice. Link: http://lkml.kernel.org/r/1553725186-41442-1-git-send-email-jagdsh.linux@gmail.com Reviewed-by: Mukesh Ojha Signed-off-by: Jagadeesh Pagadala Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1974ce818ddb..82c70b63d375 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #ifdef CONFIG_FTRACE_SYSCALLS -- cgit v1.2.3 From 1b038c6e05ff70a1e66e3e571c2e6106bdb75f53 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Fri, 17 May 2019 13:52:31 +0200 Subject: perf/ring_buffer: Fix exposing a temporarily decreased data_head In perf_output_put_handle(), an IRQ/NMI can happen in below location and write records to the same ring buffer: ... local_dec_and_test(&rb->nest) ... <-- an IRQ/NMI can happen here rb->user_page->data_head = head; ... In this case, a value A is written to data_head in the IRQ, then a value B is written to data_head after the IRQ. And A > B. As a result, data_head is temporarily decreased from A to B. And a reader may see data_head < data_tail if it read the buffer frequently enough, which creates unexpected behaviors. This can be fixed by moving dec(&rb->nest) to after updating data_head, which prevents the IRQ/NMI above from updating data_head. [ Split up by peterz. 
] Signed-off-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mark.rutland@arm.com Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.224478157@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 674b35383491..009467a60578 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -51,11 +51,18 @@ again: head = local_read(&rb->head); /* - * IRQ/NMI can happen here, which means we can miss a head update. + * IRQ/NMI can happen here and advance @rb->head, causing our + * load above to be stale. */ - if (!local_dec_and_test(&rb->nest)) + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + if (local_read(&rb->nest) > 1) { + local_dec(&rb->nest); goto out; + } /* * Since the mmap() consumer (userspace) can run on a different CPU: @@ -87,9 +94,18 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. + * We must publish the head before decrementing the nest count, + * otherwise an IRQ/NMI can publish a more recent head value and our + * write will (temporarily) publish a stale value. + */ + barrier(); + local_set(&rb->nest, 0); + + /* + * Ensure we decrement @rb->nest before we validate the @rb->head. + * Otherwise we cannot be sure we caught the 'last' nested update. */ + barrier(); if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); goto again; -- cgit v1.2.3 From 3f9fbe9bd86c534eba2faf5d840fd44c6049f50e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:32 +0200 Subject: perf/ring_buffer: Add ordering to rb->nest increment Similar to how decrementing rb->nest too early can cause data_head to (temporarily) be observed to go backward, so too can this happen when we increment too late. This barrier() ensures the rb->head load happens after the increment, both for the one in the 'goto again' path and for the one from perf_output_get_handle() -- albeit very unlikely to matter for the latter. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.309516009@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 009467a60578..4b5f8d932400 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -48,6 +48,15 @@ static void perf_output_put_handle(struct perf_output_handle *handle) unsigned long head; again: + /* + * In order to avoid publishing a head value that goes backwards, + * we must ensure the load of @rb->head happens after we've + * incremented @rb->nest.
+ * + * Otherwise we can observe a @rb->head value before one published + * by an IRQ/NMI happening between the load and the increment. + */ + barrier(); head = local_read(&rb->head); /* -- cgit v1.2.3 From 4d839dd9e4356bbacf3eb0ab13a549b83b008c21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:33 +0200 Subject: perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data We must use {READ,WRITE}_ONCE() on rb->user_page data such that concurrent usage will see whole values. A few key sites were missing this. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: 7b732a750477 ("perf_counter: new output ABI - part 1") Link: http://lkml.kernel.org/r/20190517115418.394192145@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4b5f8d932400..7a0c73e4b3eb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -100,7 +100,7 @@ again: * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, @@ -496,7 +496,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; @@ -528,7 +528,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; -- cgit v1.2.3 From 5322ea58a06da2e69c5ef36a9b4d4b9255edd423 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:34 +0200 Subject: perf/ring-buffer: Use regular variables for nesting While the IRQ/NMI will nest, the nest-count will be invariant over the actual exception, since the decrements will match the increments. This means we can -- carefully -- use a regular variable since the typical LOAD-STORE race doesn't exist (similar to preempt_count). This optimizes the ring-buffer for all LOAD-STORE architectures, since they need to use atomic ops to implement local_t. 
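[ Illustration, not part of the patch series: the publish protocol that these three ring-buffer patches converge on, sketched as self-contained C. The names rb_model and rb_publish are invented for this sketch, READ_ONCE()/WRITE_ONCE()/barrier() are reduced to minimal gcc/clang stand-ins, and the smp_wmb() pairing with the userspace reader is omitted.

#define barrier()        __asm__ __volatile__("" ::: "memory")
#define READ_ONCE(x)     (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct rb_model {
	unsigned int nest;        /* nested writers; invariant over an IRQ */
	unsigned long head;       /* kernel-side write position */
	unsigned long data_head;  /* stands in for user_page->data_head */
};

static void rb_publish(struct rb_model *rb)
{
	unsigned int nest = READ_ONCE(rb->nest);
	unsigned long head;

	if (nest > 1) {                    /* inner writer: only drop the count */
		WRITE_ONCE(rb->nest, nest - 1);
		return;
	}
again:
	barrier();                         /* load head only after nest is seen */
	head = READ_ONCE(rb->head);
	WRITE_ONCE(rb->data_head, head);   /* publish to the "user page" */
	barrier();                         /* publish before dropping the count */
	WRITE_ONCE(rb->nest, 0);
	barrier();                         /* drop the count before the re-check */
	if (head != READ_ONCE(rb->head)) { /* an IRQ writer slipped in */
		WRITE_ONCE(rb->nest, 1);
		goto again;
	}
}

An IRQ/NMI that lands between the data_head store and the nest store runs this protocol itself, and the final head re-check then catches anything it appended, so the outermost writer never leaves a stale head published. ]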
Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Cc: yabinc@google.com Link: http://lkml.kernel.org/r/20190517115418.481392777@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -24,7 +24,7 @@ struct ring_buffer { atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ - local_t nest; /* nested writers */ + unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ @@ -41,7 +41,7 @@ struct ring_buffer { /* AUX area */ long aux_head; - local_t aux_nest; + unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7a0c73e4b3eb..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) struct ring_buffer *rb = handle->rb; preempt_disable(); - local_inc(&rb->nest); + + /* + * Avoid an explicit LOAD/STORE such that architectures with memops + * can use them. + */ + (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } @@ -46,6 +51,17 @@ static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; + unsigned int nest; + + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + nest = READ_ONCE(rb->nest); + if (nest > 1) { + WRITE_ONCE(rb->nest, nest - 1); + goto out; + } again: /* @@ -64,15 +80,6 @@ again: * load above to be stale. */ - /* - * If this isn't the outermost nesting, we don't have to update - * @rb->user_page->data_head. - */ - if (local_read(&rb->nest) > 1) { - local_dec(&rb->nest); - goto out; - } - /* * Since the mmap() consumer (userspace) can run on a different CPU: * @@ -108,7 +115,7 @@ again: * write will (temporarily) publish a stale value. */ barrier(); - local_set(&rb->nest, 0); + WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. 
@@ -116,7 +123,7 @@ again: */ barrier(); if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); + WRITE_ONCE(rb->nest, 1); goto again; } @@ -355,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; + unsigned int nest; if (output_event->parent) output_event = output_event->parent; @@ -385,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; + nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ - if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + if (WARN_ON_ONCE(nest)) goto err_put; + WRITE_ONCE(rb->aux_nest, nest + 1); + aux_head = rb->aux_head; handle->rb = rb; @@ -419,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!handle->size) { /* A, matches D */ event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } @@ -508,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->event = NULL; - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); -- cgit v1.2.3 From b4d0d230ccfb5d1a9ea85da64aa584df7c148ee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 May 2019 19:08:01 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 36 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public licence as published by the free software foundation either version 2 of the licence or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 114 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190520170857.552531963@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cred.c | 6 +----- kernel/module-internal.h | 6 +----- kernel/module_signing.c | 6 +----- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..e74ffdc98a92 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include #include diff --git a/kernel/module-internal.h b/kernel/module-internal.h index d354341f8cc0..33783abc377b 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Module internals * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b9a926fd86b..b10fb1986ca9 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Module signature checker * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include -- cgit v1.2.3 From 6ff3f917e06625f9612f0dbcda10bef45b099b00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 May 2019 19:08:03 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 38 Based on 1 normalized pattern(s): this file is released under the gplv2 and any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190520170857.732920462@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/stop_machine.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 7231fb5953fc..2b5a6754646f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/stop_machine.c * @@ -5,8 +6,6 @@ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo - * - * This file is released under the GPLv2 and any later version. */ #include #include -- cgit v1.2.3 From f7b101d33046a837c2aa4526cef28a3c785d7af2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 15 May 2019 17:35:51 -0400 Subject: kheaders: Move from proc to sysfs The kheaders archive consisting of the kernel headers used for compiling bpf programs is in /proc. However there is concern that moving it here will make it permanent. Let us move it to /sys/kernel as discussed [1]. 
[1] https://lore.kernel.org/patchwork/patch/1067310/#1265969 Suggested-by: Steven Rostedt Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/Makefile | 4 +-- kernel/gen_ikh_data.sh | 89 -------------------------------------------------- kernel/gen_kheaders.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 40 +++++++++-------------- 4 files changed, 107 insertions(+), 115 deletions(-) delete mode 100755 kernel/gen_ikh_data.sh create mode 100755 kernel/gen_kheaders.sh (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 33824f0385b3..a8d923b5481b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o +obj-$(CONFIG_IKHEADERS) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh deleted file mode 100755 index 591a94f7b387..000000000000 --- a/kernel/gen_ikh_data.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# This script generates an archive consisting of kernel headers -# for CONFIG_IKHEADERS_PROC. -set -e -spath="$(dirname "$(readlink -f "$0")")" -kroot="$spath/.." -outdir="$(pwd)" -tarfile=$1 -cpio_dir=$outdir/$tarfile.tmp - -# Script filename relative to the kernel source root -# We add it to the archive because it is small and any changes -# to this script will also cause a rebuild of the archive. -sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" - -src_file_list=" -include/ -arch/$SRCARCH/include/ -$sfile -" - -obj_file_list=" -include/ -arch/$SRCARCH/include/ -" - -# Support incremental builds by skipping archive generation -# if timestamps of files being archived are not changed. - -# This block is useful for debugging the incremental builds. -# Uncomment it for debugging. -# iter=1 -# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; -# else; iter=$(($(cat /tmp/iter) + 1)); fi -# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter -# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter - -# include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. 
-pushd $kroot > /dev/null -src_files_md5="$(find $src_file_list -type f | - grep -v "include/generated/compile.h" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $obj_file_list -type f | - grep -v "include/generated/compile.h" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" - -if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi -if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then - exit -fi - -if [ "${quiet}" != "silent_" ]; then - echo " GEN $tarfile" -fi - -rm -rf $cpio_dir -mkdir $cpio_dir - -pushd $kroot > /dev/null -for f in $src_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null - -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. -for f in $obj_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 - -# Remove comments except SDPX lines -find $cpio_dir -type f -print0 | - xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' - -tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null - -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 -echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 - -rm -rf $cpio_dir diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh new file mode 100755 index 000000000000..581b83534587 --- /dev/null +++ b/kernel/gen_kheaders.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. 
+pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 70ae6052920d..8f69772af77b 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c @@ -8,9 +8,8 @@ #include #include -#include +#include #include -#include /* * Define kernel_headers_data and kernel_headers_data_end, within which the @@ -31,39 +30,32 @@ extern char kernel_headers_data; extern char kernel_headers_data_end; static ssize_t -ikheaders_read_current(struct file *file, char __user *buf, - size_t len, loff_t *offset) +ikheaders_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) { - return simple_read_from_buffer(buf, len, offset, - &kernel_headers_data, - &kernel_headers_data_end - - &kernel_headers_data); + memcpy(buf, &kernel_headers_data + off, len); + return len; } -static const struct file_operations ikheaders_file_ops = { - .read = ikheaders_read_current, - .llseek = default_llseek, +static struct bin_attribute kheaders_attr __ro_after_init = { + .attr = { + .name = "kheaders.tar.xz", + .mode = 0444, + }, + .read = &ikheaders_read, }; static int __init ikheaders_init(void) { - struct proc_dir_entry *entry; - - /* create the current headers file */ - entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, - &ikheaders_file_ops); - if (!entry) - return -ENOMEM; - - proc_set_size(entry, - &kernel_headers_data_end - - &kernel_headers_data); - return 0; + kheaders_attr.size = (&kernel_headers_data_end - + &kernel_headers_data); + return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } static void __exit ikheaders_cleanup(void) { - remove_proc_entry("kheaders.tar.xz", NULL); + sysfs_remove_bin_file(kernel_kobj, &kheaders_attr); } module_init(ikheaders_init); -- cgit v1.2.3 From 1457dc9ed8da871fbbc0a2ebdaed0405eeeed0cf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 15 May 2019 17:35:52 -0400 Subject: kheaders: Do not regenerate archive if config is not changed Linus reported an issue that doing an allmodconfig was causing the 
kheaders archive to be regenerated even though the config is the same. This patch fixes the issue by ignoring the config-related header files for "knowing when to regenerate based on timestamps". Instead, if the CONFIG_X_Y option really changes, then there is the include/config/X/Y.h file, which already tells us "if a config really changed". So we don't really need these files for regeneration detection anyway, and ignoring them fixes Linus's issue. Reported-by: Linus Torvalds Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/gen_kheaders.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 581b83534587..9a34e1d9bd7f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -31,9 +31,8 @@ arch/$SRCARCH/include/ # This block is useful for debugging the incremental builds. # Uncomment it for debugging. -# iter=1 -# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; -# else; iter=$(($(cat /tmp/iter) + 1)); fi +# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; +# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi # find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter # find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter @@ -43,10 +42,18 @@ arch/$SRCARCH/include/ pushd $kroot > /dev/null src_files_md5="$(find $src_file_list -type f | grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + grep -v "include/config/auto.conf" | + grep -v "include/config/auto.conf.cmd" | + grep -v "include/config/tristate.conf" | xargs ls -lR | md5sum | cut -d ' ' -f1)" popd > /dev/null obj_files_md5="$(find $obj_file_list -type f | grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + grep -v "include/config/auto.conf" | + grep -v "include/config/auto.conf.cmd" | + grep -v "include/config/tristate.conf" | xargs ls -lR | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi @@ -82,7 +89,7 @@ find $cpio_dir -type f -print0 | tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 +echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit v1.2.3 From 51816e9e113934281b44f1a352852ef7631e75ea Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 24 May 2019 15:42:22 -0400 Subject: locking/lock_events: Use this_cpu_add() when necessary The kernel test robot has reported that the use of __this_cpu_add() causes bug messages like: BUG: using __this_cpu_add() in preemptible [00000000] code: ... Given the imprecise nature of the count and the possibility of resetting the count and doing the measurement again, it is not really a big problem to use the unprotected __this_cpu_*() functions. To make the preemption checking code happy, the this_cpu_*() functions will be used if CONFIG_DEBUG_PREEMPT is defined. The imprecise nature of the locking counts is also documented with the suggestion that we should run the measurement a few times with the counts reset in between to get a better picture of what is going on under the hood. 
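[ Illustration, not part of the patch: the same compile-time selection pattern applied to a hypothetical percpu event counter. this_cpu_inc(), __this_cpu_inc() and DECLARE_PER_CPU() are the real kernel interfaces; everything named myev_* is invented for the sketch.

#ifdef CONFIG_DEBUG_PREEMPT
#define myev_inc(x)	this_cpu_inc(x)		/* keeps the preemption checks quiet */
#else
#define myev_inc(x)	__this_cpu_inc(x)	/* raw op, lowest overhead */
#endif

DECLARE_PER_CPU(unsigned long, myev_counts);

static inline void myev_count(void)
{
	/* A rare lost update is acceptable here, exactly as argued
	 * for the lock event counts: only the percpu sum matters. */
	myev_inc(myev_counts);
}
]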
Fixes: a8654596f0371 ("locking/rwsem: Enable lock event counting") Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- kernel/locking/lock_events.h | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index feb1acc54611..46b71af8eef2 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -30,13 +30,51 @@ enum lock_events { */ DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); +/* + * The purpose of the lock event counting subsystem is to provide a low + * overhead way to record the number of specific locking events by using + * percpu counters. It is the percpu sum that matters, not specifically + * how many of them happens in each cpu. + * + * It is possible that the same percpu counter may be modified in both + * the process and interrupt contexts. For architectures that perform + * percpu operation with multiple instructions, it is possible to lose + * count if a process context percpu update is interrupted in the middle + * and the same counter is updated in the interrupt context. Therefore, + * the generated percpu sum may not be precise. The error, if any, should + * be small and insignificant. + * + * For those architectures that do multi-instruction percpu operation, + * preemption in the middle and moving the task to another cpu may cause + * a larger error in the count. Again, this will be few and far between. + * Given the imprecise nature of the count and the possibility of resetting + * the count and doing the measurement again, this is not really a big + * problem. + * + * To get a better picture of what is happening under the hood, it is + * suggested that a few measurements should be taken with the counts + * reset in between to stamp out outliner because of these possible + * error conditions. + * + * To minimize overhead, we use __this_cpu_*() in all cases except when + * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() + * will be used to avoid the appearance of unwanted BUG messages. + */ +#ifdef CONFIG_DEBUG_PREEMPT +#define lockevent_percpu_inc(x) this_cpu_inc(x) +#define lockevent_percpu_add(x, v) this_cpu_add(x, v) +#else +#define lockevent_percpu_inc(x) __this_cpu_inc(x) +#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) +#endif + /* * Increment the PV qspinlock statistical counters */ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - __this_cpu_inc(lockevents[event]); + lockevent_percpu_inc(lockevents[event]); } #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) @@ -44,7 +82,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - __this_cpu_add(lockevents[event], inc); + lockevent_percpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -- cgit v1.2.3 From 0c97bf863efce63d6ab7971dad811601e6171d2f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 23 May 2019 14:45:35 +0200 Subject: tracing: Silence GCC 9 array bounds warning Starting with GCC 9, -Warray-bounds detects cases when memset is called starting on a member of a struct but the size to be cleared ends up writing over further members. 
Such a call happens in the trace code to clear, at once, all members after and including `seq` on struct trace_iterator: In function 'memset', inlined from 'ftrace_dump' at kernel/trace/trace.c:8914:3: ./include/linux/string.h:344:9: warning: '__builtin_memset' offset [8505, 8560] from the object at 'iter' is out of the bounds of referenced subobject 'seq' with type 'struct trace_seq' at offset 4368 [-Warray-bounds] 344 | return __builtin_memset(p, c, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In order to avoid GCC complaining about it, we compute the address ourselves by adding the offsetof distance instead of referring directly to the member. Since there are two places doing this clear (trace.c and trace_kdb.c), take the chance to move the workaround into a single place in the internal header. Link: http://lkml.kernel.org/r/20190523124535.GA12931@gmail.com Signed-off-by: Miguel Ojeda [ Removed unnecessary parenthesis around "iter" ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +----- kernel/trace/trace.h | 18 ++++++++++++++++++ kernel/trace/trace_kdb.c | 6 +----- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c92b3d9ea30..1c80521fd436 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8910,12 +8910,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 82c70b63d375..005f08629b8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1966,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } extern struct trace_iterator *tracepoint_print_iter; +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. + */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ + const size_t offset = offsetof(struct trace_iterator, seq); + + /* + * Keep gcc from complaining about overwriting more than just one + * member in the structure. + */ + memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); + + iter->pos = -1; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 6c1ae6b752d1..cca65044c14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -37,12 +37,8 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (skip_entries) kdb_printf("(skipping %d entries)\n", skip_entries); - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { -- cgit v1.2.3 From bb1869012d7b78d1474808cb4c8bd8b272645876 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 16 May 2019 12:43:19 +0200 Subject: ACPI: PM: Call pm_set_suspend_via_firmware() during hibernation On systems with ACPI platform firmware the last stage of hibernation is analogous to system suspend to S3 (suspend-to-RAM), so it should be handled analogously. In particular, pm_suspend_via_firmware() should return 'true' in that stage to let the callers of it know that control will be passed to the platform firmware going forward, so pm_set_suspend_via_firmware() needs to be called then in analogy with acpi_suspend_begin(). However, the platform hibernation ->begin() callback is invoked during the "freeze" transition (before creating a snapshot image of system memory) as well as during the "hibernate" transition which is the last stage of it and pm_set_suspend_via_firmware() should be invoked by that callback in the latter stage only. In order to implement that redefine the hibernation ->begin() callback to take a pm_message_t argument to indicate which stage of hibernation is taking place and rework acpi_hibernation_begin() and acpi_hibernation_begin_old() to take it into account as needed. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..97522630b1b6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -129,7 +129,7 @@ static int hibernation_test(int level) { return 0; } static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; + hibernation_ops->begin(PMSG_FREEZE) : 0; } /** @@ -542,7 +542,7 @@ int hibernation_platform_enter(void) * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ - error = hibernation_ops->begin(); + error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; -- cgit v1.2.3 From dfb4a6f2191a80c8b790117d0ff592fd712d3296 Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Tue, 28 May 2019 17:43:38 +0200 Subject: tracing: Avoid memory leak in predicate_parse() In case of errors, predicate_parse() goes to the out_free label to free memory and to return an error code. However, predicate_parse() does not free the predicates of the temporary prog_stack array, thence leaking them. 
Link: http://lkml.kernel.org/r/20190528154338.29976-1-tomasbortoli@gmail.com Cc: stable@vger.kernel.org Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: syzbot+6b8e0fb820e570c59e19@syzkaller.appspotmail.com Signed-off-by: Tomas Bortoli [ Added protection around freeing prog_stack[i].pred ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d3e59312ef40..5079d1db3754 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -428,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); + prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -579,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, out_free: kfree(op_stack); kfree(inverts); - kfree(prog_stack); + if (prog_stack) { + for (i = 0; prog_stack[i].pred; i++) + kfree(prog_stack[i].pred); + kfree(prog_stack); + } return ERR_PTR(ret); } -- cgit v1.2.3 From f6e2aa91a46d2bc79fce9b93a988dbe7655c90c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 28 May 2019 18:46:37 -0500 Subject: signal/ptrace: Don't leak uninitialized kernel memory with PTRACE_PEEK_SIGINFO Recently syzbot in conjunction with KMSAN reported that ptrace_peek_siginfo can copy an uninitialized siginfo to userspace. Inspecting ptrace_peek_siginfo confirms this. The problem is that off, when initialized from args.off, can be negative. At that point the "if (off >= 0)" test, meant to detect whether off went negative, fails because off was negative from the start. Prevent the core problem by adding a variable found that is only true if a siginfo is found and copied to a temporary in preparation for being copied to userspace. Prevent args.off from being truncated when being assigned to off by testing that off is <= the maximum possible value of off. Convert off to an unsigned long so that we do not have to truncate args.off, so that we have well-defined overflow behavior (any added check then won't risk fighting undefined compiler behavior), and so that we have a type whose maximum value is easy to test for. Cc: Andrei Vagin Cc: stable@vger.kernel.org Reported-by: syzbot+0d602a1b0d8c95bdf299@syzkaller.appspotmail.com Fixes: 84c751bd4aeb ("ptrace: add ability to retrieve signals without removing from a queue (v4)") Signed-off-by: "Eric W. 
Biederman" --- kernel/ptrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..02c6528ead5c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -704,6 +704,10 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (arg.nr < 0) return -EINVAL; + /* Ensure arg.off fits in an unsigned long */ + if (arg.off > ULONG_MAX) + return 0; + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) pending = &child->signal->shared_pending; else @@ -711,18 +715,20 @@ static int ptrace_peek_siginfo(struct task_struct *child, for (i = 0; i < arg.nr; ) { kernel_siginfo_t info; - s32 off = arg.off + i; + unsigned long off = arg.off + i; + bool found = false; spin_lock_irq(&child->sighand->siglock); list_for_each_entry(q, &pending->list, list) { if (!off--) { + found = true; copy_siginfo(&info, &q->info); break; } } spin_unlock_irq(&child->sighand->siglock); - if (off >= 0) /* beyond the end of the list */ + if (!found) /* beyond the end of the list */ break; #ifdef CONFIG_COMPAT -- cgit v1.2.3 From 2874c5fd284268364ece81a7bd936f3c8168e567 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:01 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 3029 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070032.746973796@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643af82f..7c473f208a10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * @@ -12,11 +13,6 @@ * Alexei Starovoitov * Daniel Borkmann * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Andi Kleen - Fix a few bad bugs and races. 
* Kris Katterjohn - Added many additional checks in bpf_check_classic() */ -- cgit v1.2.3 From 1a59d1b8e05ea6ab45f7e18897de1ef0e6bc3da6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:05 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 156 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not write to the free software foundation inc 59 temple place suite 330 boston ma 02111 1307 usa extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1334 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070033.113240726@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/audit.c | 15 +-------------- kernel/audit.h | 15 +-------------- kernel/audit_watch.c | 15 +-------------- kernel/auditfilter.c | 15 +-------------- kernel/extable.c | 14 +------------- kernel/futex.c | 15 +-------------- kernel/kprobes.c | 15 +-------------- kernel/module.c | 14 +------------- kernel/params.c | 14 +------------- kernel/tracepoint.c | 15 +-------------- 10 files changed, 10 insertions(+), 137 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b96bf69183f4..486c968214d9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c @@ -5,20 +6,6 @@ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Written by Rickard E. (Rik) Faith * * Goals: 1) Integrate fully with Security Modules. diff --git a/kernel/audit.h b/kernel/audit.h index 2071725a999f..6c076d4982da 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -1,22 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit -- definition of audit_context structure and supporting types * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. 
* Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index b50c574223fa..1f31c2f1e6fc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_watch.c -- watching inodes * * Copyright 2003-2009 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 303fb04770ce..9f8e190e3bea 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* auditfilter.c -- filtering of audit events * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/extable.c b/kernel/extable.c index 6a5b61ebc66c..e23cce6e6092 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Rewritten by Rusty Russell, on the backs of many others... Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 
- This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/futex.c b/kernel/futex.c index 2268b97d5439..4b5b468c58b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 @@ -29,20 +30,6 @@ * * "The futexes are also cursed." * "But they come in a choice of three flavours!" - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b1ea30a5540e..445337c107e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) * kernel/kprobes.c * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S Kernel diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..80c7c09584cf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (C) 2002 Richard Henderson Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..cf448785d058 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Helpers for initial module or kernel cmdline parsing Copyright (C) 2001 Rusty Russell. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..df3ade14ccbd 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include #include -- cgit v1.2.3 From c942fddf8793b2013be8c901b47d0a8dc02bf99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:06 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 157 Based on 3 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version [author] [kishon] [vijay] [abraham] [i] [kishon]@[ti] [com] this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version [author] [graeme] [gregory] [gg]@[slimlogic] [co] [uk] [author] [kishon] [vijay] [abraham] [i] [kishon]@[ti] [com] [based] [on] [twl6030]_[usb] [c] [author] [hema] [hk] [hemahk]@[ti] [com] this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1105 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070033.202006027@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/audit_fsnotify.c | 11 +---------- kernel/locking/qrwlock.c | 11 +---------- kernel/locking/qspinlock.c | 11 +---------- kernel/locking/qspinlock_stat.h | 10 +--------- kernel/sched/membarrier.c | 11 +---------- kernel/taskstats.c | 12 +----------- kernel/tsacct.c | 13 +------------ 7 files changed, 7 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index b5737b826951..f0d243318452 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_fsnotify.c -- tracking inodes * * Copyright 2003-2009,2014-2015 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. */ #include diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index c7471c3fb798..fe9ca92faa2a 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued read/write locks * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. * * Authors: Waiman Long diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index e14b32c69639..2473f10c6956 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued spinlock * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 54152670ff24..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -1,13 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * Authors: Waiman Long */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3cd8a3a795d2..aa8d75804108 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2010-2017 Mathieu Desnoyers * * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
*/ #include "sched.h" diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5f852b8f59f7..13a0f2e6ebc2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -1,19 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * taskstats.c - Export per-task statistics to userland * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..7be3e7530841 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include -- cgit v1.2.3 From 468e15fdc2ec7048ab1ae93e200559151c84647e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:17 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 170 Based on 1 normalized pattern(s): this file is release under the gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Richard Fontana Reviewed-by: Kate Stewart Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070034.216732358@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 46ba853656f6..35859da8bd4f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers - * - * This file is release under the GPLv2 - * */ #include -- cgit v1.2.3 From 25763b3c864cf517d686661012d184ee47a49b4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2019 10:10:09 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 206 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 107 file(s). 
Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Reviewed-by: Steve Winslow Reviewed-by: Alexios Zavras Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190528171438.615055994@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/bpf_lru_list.c | 5 +---- kernel/bpf/bpf_lru_list.h | 5 +---- kernel/bpf/map_in_map.c | 5 +---- kernel/bpf/map_in_map.h | 5 +---- kernel/bpf/percpu_freelist.c | 5 +---- kernel/bpf/percpu_freelist.h | 5 +---- kernel/bpf/stackmap.c | 5 +---- 7 files changed, 7 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e6ef4401a138..1b6b9349cb85 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 7d4f89b7cb84..f02504640e18 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __MAP_IN_MAP_H__ #define __MAP_IN_MAP_H__ diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include "percpu_freelist.h" diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
*/ #ifndef __PERCPU_FREELIST_H__ #define __PERCPU_FREELIST_H__ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..d38e49f943a1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include -- cgit v1.2.3 From 6b115bf58e6f013ca75e7115aabcbd56c20ff31d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:57 -0700 Subject: cgroup: Call cgroup_release() before __exit_signal() cgroup_release() calls cgroup_subsys->release() which is used by the pids controller to uncharge its pid. We want to use it to manage iteration of dying tasks which requires putting it before __unhash_process(). Move cgroup_release() above __exit_signal(). While this makes it uncharge before the pid is freed, pid is RCU freed anyway and the window is very narrow. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1803efb2922f..a75b6a7f458a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -195,6 +195,7 @@ repeat: rcu_read_unlock(); proc_flush_task(p); + cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -220,7 +221,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); -- cgit v1.2.3 From b636fd38dc40113f853337a7d2a6885ad23b8811 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Implement css_task_iter_skip() When a task is moved out of a cset, task iterators pointing to the task are advanced using the normal css_task_iter_advance() call. This is fine but we'll be tracking dying tasks on csets and thus moving tasks from cset->tasks to (to be added) cset->dying_tasks. When we remove a task from cset->tasks, if we advance the iterators, they may move over to the next cset before we had the chance to add the task back on the dying list, which can allow the task to escape iteration. This patch separates out skipping from advancing. Skipping only moves the affected iterators to the next pointer rather than fully advancing it and the following advancing will recognize that the cursor has already been moved forward and do the rest of advancing. This ensures that when a task moves from one list to another in its cset, as long as it moves in the right direction, it's always visible to iteration. This doesn't cause any visible behavior changes. 
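For illustration, the skip-versus-advance split described above can be pictured with a minimal, self-contained C sketch over a plain singly-linked list; the names here (struct iter, iter_skip, iter_next) are invented for the example and are not the kernel's css_task_iter implementation:

#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

struct iter {
	struct node *pos;	/* cursor into the list */
	bool skipped;		/* a skip already moved the cursor forward */
};

/* Skip: only nudge the cursor off @n and remember that we did so. */
static void iter_skip(struct iter *it, struct node *n)
{
	if (it->pos == n) {
		it->pos = n->next;
		it->skipped = true;
	}
}

/* Advance: if a preceding skip already moved the cursor, consuming the
 * flag is all that remains; otherwise move forward normally. */
static struct node *iter_next(struct iter *it)
{
	if (it->skipped)
		it->skipped = false;
	else if (it->pos)
		it->pos = it->pos->next;
	return it->pos;
}

Because iter_skip() only moves the cursor one step and records that it did, a task that is promptly re-added to another list of the same cset stays visible to the iterator, which is the property the patch relies on.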
Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/cgroup/cgroup.c | 60 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..035aee466bbf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -843,6 +844,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -868,22 +884,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -4430,10 +4433,19 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } -static void css_task_iter_advance(struct css_task_iter *it) +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) { - struct list_head *next; + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} +static void css_task_iter_advance(struct css_task_iter *it) +{ lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4442,15 +4454,15 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. 
*/ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); -- cgit v1.2.3 From c03cd7738a83b13739f00546166969342c8ff014 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Include dying leaders with live threads in PROCS iterations CSS_TASK_ITER_PROCS currently iterates live group leaders; however, this means that a process with a dying leader and live threads will be skipped. IOW, cgroup.procs might be empty while cgroup.threads isn't, which is confusing to say the least. Fix it by making cset track dying tasks and include dying leaders with live threads in PROCS iteration. Signed-off-by: Tejun Heo Reported-and-tested-by: Topi Miettinen Cc: Oleg Nesterov --- kernel/cgroup/cgroup.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 035aee466bbf..a7df319c2e9a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -739,6 +739,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -1213,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -4399,15 +4401,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4446,6 +4451,8 @@ static void css_task_iter_skip(struct css_task_iter *it, static void css_task_iter_advance(struct css_task_iter *it) { + struct task_struct *task; + lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4462,17 +4469,32 @@ repeat: if (it->task_pos == it->tasks_head) it->task_pos = it->mg_tasks_head->next; if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && -
!thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -6009,6 +6031,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; WARN_ON_ONCE(cgroup_task_frozen(tsk)); @@ -6034,6 +6057,13 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); + + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) -- cgit v1.2.3 From 8856ae4df3e9b5295ea2da7ad3b00796386454ec Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 31 May 2019 22:30:12 -0700 Subject: kernel/fork.c: make max_threads symbol static Fix build warning: kernel/fork.c:125:5: warning: symbol 'max_threads' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20190516015118.140561-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reported-by: Hulk Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b2b87d450b80..75675b9bf6df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -123,7 +123,7 @@ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ -int max_threads; /* tunable limit on nr_threads */ +static int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -- cgit v1.2.3 From 11bbd8b416f8abf40900dc5041152892f873d915 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 31 May 2019 22:30:16 -0700 Subject: prctl_set_mm: refactor checks from validate_prctl_map Despite the comment of validate_prctl_map claiming there are no capability checks, it is not completely true since commit 4d28df6152aa ("prctl: Allow local CAP_SYS_ADMIN changing exe_file"). Extract the check out of the function and make the function perform purely arithmetic checks. This patch should not change any behavior, it is mere refactoring for the following patch.
[akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190502125203.24014-2-mkoutny@suse.com Signed-off-by: Michal Koutný Reviewed-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Cc: Kirill Tkhai Cc: Laurent Dufour Cc: Mateusz Guzik Cc: Michal Hocko Cc: Yang Shi Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index bdbfe8d37418..775bf8d18d03 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1882,13 +1882,14 @@ exit_err: } /* + * Check arithmetic relations of passed addresses. + * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. */ -static int validate_prctl_map(struct prctl_mm_map *prctl_map) +static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; - struct mm_struct *mm = current->mm; int error = -EINVAL, i; static const unsigned char offsets[] = { @@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) prctl_map->start_data)) goto out; - /* - * Someone is trying to cheat the auxv vector. - */ - if (prctl_map->auxv_size) { - if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) - goto out; - } - - /* - * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. - */ - if (prctl_map->exe_fd != (u32)-1) { - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) - goto out; - } - error = 0; out: return error; @@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { + /* + * Someone is trying to cheat the auxv vector. + */ + if (!prctl_map.auxv || + prctl_map.auxv_size > sizeof(mm->saved_auxv)) + return -EINVAL; + memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, @@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } if (prctl_map.exe_fd != (u32)-1) { + /* + * Make sure the caller has the rights to + * change /proc/pid/exe link: only local sys admin should + * be allowed to. 
+ */ + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EINVAL; + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; @@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; - struct prctl_mm_map prctl_map; + struct prctl_mm_map prctl_map = { + .auxv = NULL, + .auxv_size = 0, + .exe_fd = -1, + }; struct vm_area_struct *vma; int error; @@ -2139,9 +2141,6 @@ static int prctl_set_mm(int opt, unsigned long addr, prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; - prctl_map.auxv = NULL; - prctl_map.auxv_size = 0; - prctl_map.exe_fd = -1; switch (opt) { case PR_SET_MM_START_CODE: @@ -2181,7 +2180,7 @@ static int prctl_set_mm(int opt, unsigned long addr, goto out; } - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) goto out; -- cgit v1.2.3 From bc81426f5beef7da863d3365bc9d45e820448745 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 31 May 2019 22:30:19 -0700 Subject: prctl_set_mm: downgrade mmap_sem to read lock The commit a3b609ef9f8b ("proc read mm's {arg,env}_{start,end} with mmap semaphore taken.") added synchronization of reading argument/environment boundaries under mmap_sem. Later commit 88aa7cc688d4 ("mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct") avoided the coarse use of mmap_sem in similar situations. But there still remained two places that (mis)use mmap_sem. get_cmdline should also use arg_lock instead of mmap_sem when it reads the boundaries. The second place that should use arg_lock is in prctl_set_mm. By protecting the boundaries fields with the arg_lock, we can downgrade mmap_sem to reader lock (analogous to what we already do in prctl_set_mm_map). [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190502125203.24014-3-mkoutny@suse.com Fixes: 88aa7cc688d4 ("mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct") Signed-off-by: Michal Koutný Signed-off-by: Laurent Dufour Co-developed-by: Laurent Dufour Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Yang Shi Cc: Mateusz Guzik Cc: Kirill Tkhai Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 775bf8d18d03..2969304c29fe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2127,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurrent updates of arg boundaries, we need + * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * validation.
+ */ + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); + spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; @@ -2217,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_write(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); + up_read(&mm->mmap_sem); return error; } -- cgit v1.2.3 From 9852ae3fe5293264f01c49f2571ef7688f7823ce Mon Sep 17 00:00:00 2001 From: Chris Down Date: Fri, 31 May 2019 22:30:22 -0700 Subject: mm, memcg: consider subtrees in memory.events memory.stat and other files already consider subtrees in their output, and we should too in order to not present an inconsistent interface. The current situation is fairly confusing, because people interacting with cgroups expect hierarchical behaviour in the vein of memory.stat, cgroup.events, and other files. For example, this causes confusion when debugging reclaim events under low, as currently these always read "0" at non-leaf memcg nodes, which frequently causes people to misdiagnose breach behaviour. The same confusion applies to other counters in this file when debugging issues. Aggregation is done at write time instead of at read time since these counters aren't hot (unlike memory.stat which is per-page, so it does it at read time), and it makes sense to bundle this with the file notifications. After this patch, events are propagated up the hierarchy: [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 0 oom 0 oom_kill 0 [root@ktst ~]# systemd-run -p MemoryMax=1 true Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 7 oom 1 oom_kill 1 As this is a change in behaviour, this can be reverted to the old behaviour by mounting with the `memory_localevents' flag set. However, we use the new behaviour by default as there's a lack of evidence that there are any current users of memory.events that would find this change undesirable. akpm: this is a behaviour change, so Cc:stable. This is so that forthcoming distros which use cgroup v2 are more likely to pick up the revised behaviour.
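The write-time aggregation described above boils down to walking up the hierarchy whenever an event is charged. A short C sketch of that pattern follows; the struct, field, and function names are illustrative stand-ins (the real accounting lives in mm/memcontrol.c, which is not part of the diff below), and the per-group local_events flag approximates what is really a global mount option:

#include <stdbool.h>

struct group {
	struct group *parent;
	unsigned long events[5];	/* low, high, max, oom, oom_kill */
	bool local_events;		/* mounted with memory_localevents */
};

/* Charge an event to @g and, unless local-only accounting was requested,
 * to every ancestor as well, so non-leaf groups no longer read "0". */
static void group_memory_event(struct group *g, int idx)
{
	do {
		g->events[idx]++;
		/* a memory.events file notification would be raised here */
	} while (!g->local_events && (g = g->parent));
}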
Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Dennis Zhou Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1810,11 +1810,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_memory_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_param_specs[] = { - fsparam_flag ("nsdelegate", Opt_nsdelegate), + fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("memory_localevents", Opt_memory_localevents), {} }; @@ -1837,6 +1839,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_memory_localevents: + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1848,6 +1853,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; } } @@ -1855,6 +1865,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + seq_puts(seq, ",memory_localevents"); return 0; } @@ -6325,7 +6337,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); -- cgit v1.2.3 From 98af37d624ed8c83f1953b1b6b2f6866011fc064 Mon Sep 17 00:00:00 2001 From: Zhenliang Wei Date: Fri, 31 May 2019 22:30:52 -0700 Subject: kernel/signal.c: trace_signal_deliver when signal_group_exit In the fixes commit, removing SIGKILL from each thread signal mask and executing "goto fatal" directly will skip the call to "trace_signal_deliver". At this point, the delivery tracking of the SIGKILL signal will be inaccurate. Therefore, we need to add trace_signal_deliver before "goto fatal" after executing sigdelset. Note: SEND_SIG_NOINFO matches the fact that SIGKILL doesn't have any info. Link: http://lkml.kernel.org/r/20190425025812.91424-1-weizhenliang@huawei.com Fixes: cf43a757fd4944 ("signal: Restore the stop PTRACE_EVENT_EXIT") Signed-off-by: Zhenliang Wei Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Cc: Eric W. 
Biederman Cc: Ivan Delalande Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Deepa Dinamani Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d7b9d14ac80d..328a01e1a2f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2485,6 +2485,8 @@ relock: if (signal_group_exit(signal)) { ksig->info.si_signo = signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); recalc_sigpending(); goto fatal; } -- cgit v1.2.3 From a61373476127edac8bcc5ee9d68a74dc1b864f53 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 May 2019 12:45:18 +0200 Subject: PM: sleep: Add kerneldoc comments to some functions Add kerneldoc comments to pm_suspend_via_firmware(), pm_resume_via_firmware() and pm_suspend_via_s2idle() to explain what they do. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..43d869db6c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -62,6 +62,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +/** + * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. + */ bool pm_suspend_via_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; -- cgit v1.2.3 From ec527c318036a65a083ef68d8ba95789d2212246 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 30 May 2019 00:09:39 +0200 Subject: x86/power: Fix 'nosmt' vs hibernation triple fault during resume As explained in 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") we always, no matter what, have to bring up x86 HT siblings during boot at least once in order to avoid the first MCE bringing the system to its knees. That means that whenever 'nosmt' is supplied on the kernel command-line, all the HT siblings are as a result sitting in mwait or cpuidle after going through the online-offline cycle at least once. This causes a serious issue though when a kernel, which saw 'nosmt' on its command-line, is going to perform resume from hibernation: if the resume from the hibernated image is successful, cr3 is flipped in order to point to the address space of the kernel that is being resumed, which in turn means that all the HT siblings are all of a sudden mwaiting on an address which is no longer valid. That results in a triple fault shortly after cr3 is switched, and the machine reboots. Fix this by always waking up all the SMT siblings before initiating the 'restore from hibernation' process; this guarantees that all the HT siblings will be properly carried over to the resumed kernel waiting in resume_play_dead(), and acted upon accordingly afterwards, based on the target kernel configuration. Symmetrically, the resumed kernel has to push the SMT siblings to mwait again in case it has SMT disabled; this means it has to online all the siblings when resuming (so that they come out of hlt) and offline them again to let them reach mwait.
Cc: 4.19+ # v4.19+ Debugged-by: Thomas Gleixner Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") Signed-off-by: Jiri Kosina Acked-by: Pavel Machek Reviewed-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 4 ++-- kernel/power/hibernate.c | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f2ef10460698..077fde6fb953 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2061,7 +2061,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2093,7 +2093,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static int cpuhp_smt_enable(void) +int cpuhp_smt_enable(void) { int cpu, ret = 0; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..b65635753e8e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -257,6 +257,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, (kps % 1000) / 10); } +__weak int arch_resume_nosmt(void) +{ + return 0; +} + /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -324,6 +329,10 @@ static int create_image(int platform_mode) Enable_cpus: suspend_enable_secondary_cpus(); + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); + Platform_finish: platform_finish(platform_mode); -- cgit v1.2.3 From c732327f04a3818f35fa97d07b1d64d31b691d78 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Jun 2019 15:18:43 +0200 Subject: signal: improve comments Improve the comments for pidfd_send_signal(). First, the comment still referred to a file descriptor for a process as a "task file descriptor", which stems from way back at the beginning of the discussion. Replace this with "pidfd" for consistency. Second, the wording for the explanation of the arguments to the syscall was a bit inconsistent, e.g. some used the past tense, some used the present tense. Make the wording more consistent. Signed-off-by: Christian Brauner --- kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 328a01e1a2f0..d622eac9d169 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3621,12 +3621,11 @@ static struct pid *pidfd_to_pid(const struct file *file) } /** - * sys_pidfd_send_signal - send a signal to a process through a task file - * descriptor - * @pidfd: the file descriptor of the process - * @sig: signal to be sent - * @info: the signal info - * @flags: future flags to be passed + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd: file descriptor of the process + * @sig: signal to send + * @info: signal info + * @flags: future flags * * The syscall currently only signals via PIDTYPE_PID which covers * kill(<positive-pid>, <signal>).
It does not signal threads or process -- cgit v1.2.3 From 9c92ab61914157664a2fbdf926df0eb937838e45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:17:56 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 282 Based on 1 normalized pattern(s): this software is licensed under the terms of the gnu general public license version 2 as published by the free software foundation and may be copied distributed and modified under those terms this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 285 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141900.642774971@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cpu_pm.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 67b02e138a47..cbca6879ab7d 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Google, Inc. * * Author: * Colin Cross - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include -- cgit v1.2.3 From 5b497af42fab12cadc0e29bcb7052cf9963603f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:18:09 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 295 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 64 file(s). 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141901.894819585@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/arraymap.c | 10 +--------- kernel/bpf/devmap.c | 10 +--------- kernel/bpf/disasm.c | 10 +--------- kernel/bpf/disasm.h | 10 +--------- kernel/bpf/hashtab.c | 10 +--------- kernel/bpf/helpers.c | 10 +--------- kernel/bpf/syscall.c | 10 +--------- kernel/bpf/verifier.c | 10 +--------- 8 files changed, 8 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 584636c9e2eb..262a321f58a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..15dbc15c5b0c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ /* Devmaps primary use is as a backend map for XDP BPF helper call diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d9ce383c0f9c..b44d8c447afd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e1324a834a24..e546b18d27da 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __BPF_DISASM_H__ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0f2708fde5f7..583df5cb302d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4266ffde07ca..5e28718928ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..ef63d26622f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d15cc4fafa89 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include #include -- cgit v1.2.3 From 4505153954fdb1465d2b178288a9bf646f2a2166 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 16:57:47 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 333 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not write to the free software foundation inc 59 temple place suite 330 boston ma 02111 1307 usa extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 136 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190530000436.384967451@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/dma/debug.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index badd77670d00..099002d84f46 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 Advanced Micro Devices, Inc. * * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "DMA-API: " fmt -- cgit v1.2.3 From ddc64d0ac97814fcc42ed90a2ea0c69658806c67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 May 2019 01:09:24 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 363 Based on 1 normalized pattern(s): released under terms in gpl version 2 see copying extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531081035.689962394@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index cf727d77c6c6..8ebd0fa826f8 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. 
*/ /* The 'cpumap' is primarily used as a backend map for XDP BPF helper -- cgit v1.2.3 From 55716d26439f5c4008b0bcb7f17d1f7c0d8fbcfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:42 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 428 Based on 1 normalized pattern(s): this file is released under the gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 68 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190114.292346262@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 3 +-- kernel/power/main.c | 4 +--- kernel/power/snapshot.c | 4 +--- kernel/power/suspend.c | 3 +-- kernel/power/suspend_test.c | 3 +-- kernel/power/swap.c | 4 +--- kernel/power/user.c | 4 +--- 7 files changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 97522630b1b6..8fc054e5c501 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * @@ -6,8 +7,6 @@ * Copyright (c) 2004 Pavel Machek * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/main.c b/kernel/power/main.c index 4f43e724f6eb..bdbd605c4215 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * */ #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bc9558ab1e5b..83105874f255 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/snapshot.c * @@ -5,9 +6,6 @@ * * Copyright (C) 1998-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..3c57e206f3e8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend.c - Suspend to RAM and standby functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 6a897e8b2a88..60564b58de07 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. * * Copyright (c) 2009 Pavel Machek - * - * This file is released under the GPLv2. 
*/ #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7f6c1a288d3..e1912ad13bdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * @@ -7,9 +8,6 @@ * Copyright (C) 1998,2001-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki * Copyright (C) 2010-2012 Bojan Smojver - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/user.c b/kernel/power/user.c index cb24e840a3e6..77438954cc2b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * */ #include -- cgit v1.2.3 From 767a67b0b35520348dc3b28dcba06454b0f9023d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:44 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 430 Based on 1 normalized pattern(s): distribute under gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 8 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190114.475576622@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/softirq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c3382378d94..a6b81c6b6bff 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/softirq.c * * Copyright (C) 1992 Linus Torvalds * - * Distribute under GPLv2. - * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ -- cgit v1.2.3 From 3e45610181bcc2c68d343a62c1d028890160ef79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:50 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 436 Based on 1 normalized pattern(s): distributed under the terms of the gnu gpl version 2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 2 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.032570679@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/locking/semaphore.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 561acdd39960..d9dd94defc0a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Intel Corporation * Author: Matthew Wilcox * - * Distributed under the terms of the GNU GPL, version 2 - * * This file implements counting semaphores. * A counting semaphore may be acquired 'n' times before sleeping. 
* See mutex.c for single-acquisition sleeping locks which enforce -- cgit v1.2.3 From b886d83c5b621abc84ff9616f14c529be3f6b147 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:55 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 441 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation version 2 of the license extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 315 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Armijn Hemel Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.503150771@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/async.c | 6 +----- kernel/backtracetest.c | 6 +----- kernel/latencytop.c | 6 +----- kernel/nsproxy.c | 6 +----- kernel/sched/cpudeadline.c | 6 +----- kernel/sched/cpupri.c | 6 +----- kernel/ucount.c | 7 +------ kernel/user_namespace.c | 7 +------ kernel/utsname.c | 6 +----- kernel/utsname_sysctl.c | 6 +----- 10 files changed, 10 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 12c332e4e13e..4f9c1d614016 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * async.c: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a563c8fdad0d..a2a97fa3071b 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple stack backtrace regression test module * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 871734ea2f04..e3acead004e6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * latencytop.c: Latency display infrastructure * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f6c5d330059a..c815f58e6bc0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2006 IBM Corporation * * Author: Serge Hallyn * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * * Jun 2006 - namespaces support * OpenVZ, SWsoft Inc. 
* Pavel Emelianov diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..ec4e4a9aab5f 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpudl.c * * Global CPU deadline management * * Author: Juri Lelli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..9c6480e6d62d 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpupri.c * @@ -20,11 +21,6 @@ * searches). For tasks with affinity restrictions, the algorithm has a * worst case complexity of O(min(102, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" diff --git a/kernel/ucount.c b/kernel/ucount.c index f48d1b6376a4..feb128c7b5d9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..0eff45ce7703 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index dcd6be1996fe..f0e491193009 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004 IBM Corporation * * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 258033d62cb3..3732c888a949 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include -- cgit v1.2.3 From cee0c33c546a93957a52ae9ab6bebadbee765ec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 5 Jun 2019 09:54:34 -0700 Subject: cgroup: css_task_iter_skip()'d iterators must be advanced before accessed b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") introduced css_task_iter_skip() which is used to fix task iterations skipping dying threadgroup leaders with live threads. 
Skipping is implemented as a subportion of full advancing but css_task_iter_next() forgot to fully advance a skipped iterator before determining the next task to visit causing it to return invalid task pointers. Fix it by making css_task_iter_next() fully advance the iterator if it has been skipped since the previous iteration. Signed-off-by: Tejun Heo Reported-by: syzbot Link: http://lkml.kernel.org/r/00000000000097025d058a7fd785@google.com Fixes: b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a7df319c2e9a..9538a12d42d6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4550,6 +4550,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); + /* @it may be half-advanced by skips, finish advancing */ + if (it->flags & CSS_TASK_ITER_SKIPPED) + css_task_iter_advance(it); + if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); -- cgit v1.2.3 From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Jun 2019 01:48:57 +0200 Subject: bpf: fix unconnected udp hooks Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently to applications as also stated in original motivation in 7828f20e3779 ("Merge branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter two hooks into Cilium to enable host based load-balancing with Kubernetes, I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes typically sets up DNS as a service and is thus subject to load-balancing. Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API is currently insufficient and thus not usable as-is for standard applications shipped with most distros. To break down the issue we ran into with a simple example: # cat /etc/resolv.conf nameserver 147.75.207.207 nameserver 147.75.207.208 For the purpose of a simple test, we set up above IPs as service IPs and transparently redirect traffic to a different DNS backend server for that node: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 The attached BPF program is basically selecting one of the backends if the service IP/port matches on the cgroup hook. DNS breaks here, because the hooks are not transparent enough to applications which have built-in msg_name address checks: # nslookup 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ;; connection timed out; no servers could be reached # dig 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; connection timed out; no servers could be reached For comparison, if none of the service IPs is used, and we tell nslookup to use 8.8.8.8 directly it works just fine, of course: # nslookup 1.1.1.1 8.8.8.8 1.1.1.1.in-addr.arpa name = one.one.one.one. In order to fix this and thus act more transparent to the application, this needs reverse translation on recvmsg() side. 
A minimal fix for this API is to add similar recvmsg() hooks behind the BPF cgroups static key such that the program can track state and replace the current sockaddr_in{,6} with the original service IP. From BPF side, this basically tracks the service tuple plus socket cookie in an LRU map where the reverse NAT can then be retrieved via map value as one example. Side-note: the BPF cgroups static key should be converted to a per-hook static key in future. Same example after this fix: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 Lookups work fine now: # nslookup 1.1.1.1 1.1.1.1.in-addr.arpa name = one.one.one.one. Authoritative answers can be found from: # dig 1.1.1.1 ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550 ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 ;; OPT PSEUDOSECTION: ; EDNS: version: 0, flags:; udp: 512 ;; QUESTION SECTION: ;1.1.1.1. IN A ;; AUTHORITY SECTION: . 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400 ;; Query time: 17 msec ;; SERVER: 147.75.207.207#53(147.75.207.207) ;; WHEN: Tue May 21 12:59:38 UTC 2019 ;; MSG SIZE rcvd: 111 And from an actual packet level it shows that we're using the back end server when talking via 147.75.207.20{7,8} front end: # tcpdump -i any udp [...] 12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) [...] In order to be flexible and to have same semantics as in sendmsg BPF programs, we only allow return codes in [1,1] range. In the sendmsg case the program is called if msg->msg_name is present which can be the case in both, connected and unconnected UDP. The former only relies on the sockaddr_in{,6} passed via connect(2) if passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar way to call into the BPF program whenever a non-NULL msg->msg_name was passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note that for TCP case, the msg->msg_name is ignored in the regular recvmsg path and therefore not relevant. For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE, the hook is not called. This is intentional as it aligns with the same semantics as in case of TCP cgroup BPF hooks right now. This might be better addressed in future through a different bpf_attach_type such that this case can be distinguished from the regular recvmsg paths, for example. 
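As an illustrative sketch only (not part of this patch: the map layout and the names svc_val, rev_nat and reverse_translate4 are made up here), a recvmsg4 program along the lines described above could look roughly like this, assuming the companion sendmsg4 hook records the original service tuple keyed by socket cookie:

  /* SPDX-License-Identifier: GPL-2.0 */
  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  struct svc_val {
          __u32 ip4;      /* original service IP, network byte order */
          __u32 port;     /* original service port, network byte order */
  };

  /* socket cookie -> original service tuple, written by the
   * corresponding cgroup/sendmsg4 program (not shown)
   */
  struct bpf_map_def SEC("maps") rev_nat = {
          .type = BPF_MAP_TYPE_LRU_HASH,
          .key_size = sizeof(__u64),
          .value_size = sizeof(struct svc_val),
          .max_entries = 65536,
  };

  SEC("cgroup/recvmsg4")
  int reverse_translate4(struct bpf_sock_addr *ctx)
  {
          __u64 cookie = bpf_get_socket_cookie(ctx);
          struct svc_val *val = bpf_map_lookup_elem(&rev_nat, &cookie);

          if (val) {
                  /* rewrite msg_name back to the service address the
                   * application expects to see
                   */
                  ctx->user_ip4 = val->ip4;
                  ctx->user_port = val->port;
          }
          return 1;
  }

  char _license[] SEC("license") = "GPL";

Note the unconditional return 1: as the verifier change below enforces, recvmsg attach points only accept a return code of 1, since there is nothing sensible to reject at this stage.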
Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Acked-by: Martynas Pumputis Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..e8ba3a153691 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1581,6 +1581,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: return 0; default: return -EINVAL; @@ -1875,6 +1877,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1960,6 +1964,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -2011,6 +2017,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d2c8a6677ac4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5361,9 +5361,12 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: @@ -5380,16 +5383,17 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } return 0; -- cgit v1.2.3 From 54b7b868e826b294687c439b68ec55fe20cafe5b Mon Sep 17 00:00:00 2001 From: Angelo Ruocco Date: Tue, 21 May 2019 10:01:54 +0200 Subject: cgroup: let a symlink too be created with a cftype file This commit enables a cftype to have a symlink (of any name) that points to the file associated with the cftype. 
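As a hypothetical illustration (none of the names below come from this patch), a controller that wants a plain "weight_alias" symlink next to its "weight" file could declare:

  static int example_weight_show(struct seq_file *sf, void *v);
  static ssize_t example_weight_write(struct kernfs_open_file *of, char *buf,
                                      size_t nbytes, loff_t off);

  static struct cftype example_files[] = {
          {
                  .name = "weight",               /* the real file */
                  .link_name = "weight_alias",    /* the symlink */
                  .flags = CFTYPE_SYMLINKED,
                  .seq_show = example_weight_show,
                  .write = example_weight_write,
          },
          { }     /* terminator */
  };

Both names go through the same subsystem-prefixing logic, so on a hierarchy where the file shows up as "<subsys>.weight" the symlink shows up as "<subsys>.weight_alias".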
Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 426a0026225c..155048b0eca2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) +static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf, bool write_link_name) { struct cgroup_subsys *ss = cft->ss; @@ -1471,13 +1471,26 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - cft->name); + write_link_name ? cft->link_name : cft->name); } else { - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, write_link_name ? cft->link_name : cft->name, + CGROUP_FILE_NAME_MAX); } return buf; } +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, false); +} + +static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, true); +} + /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1636,6 +1649,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); + if (cft->flags & CFTYPE_SYMLINKED) + kernfs_remove_by_name(cgrp->kn, + cgroup_link_name(cgrp, cft, name)); } /** @@ -3821,6 +3837,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; + struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3851,6 +3868,14 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } + if (cft->flags & CFTYPE_SYMLINKED) { + kn_link = kernfs_create_link(cgrp->kn, + cgroup_link_name(cgrp, cft, name), + kn); + if (IS_ERR(kn_link)) + return PTR_ERR(kn_link); + } + return 0; } -- cgit v1.2.3 From cf8929885de318c0bf73438c9e5dde59d6536f7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jun 2019 03:35:41 -0600 Subject: cgroup/bfq: revert bfq.weight symlink change There's some discussion on how to do this the best, and Tejun prefers that BFQ just create the file itself instead of having cgroups support a symlink feature. Hence revert commit 54b7b868e826 and 19e9da9e86c4 for 5.2, and this can be done properly for 5.3. 
Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 155048b0eca2..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf, bool write_link_name) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { struct cgroup_subsys *ss = cft->ss; @@ -1471,26 +1471,13 @@ static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - write_link_name ? cft->link_name : cft->name); + cft->name); } else { - strscpy(buf, write_link_name ? cft->link_name : cft->name, - CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - return cgroup_fill_name(cgrp, cft, buf, false); -} - -static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - return cgroup_fill_name(cgrp, cft, buf, true); -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1649,9 +1636,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); - if (cft->flags & CFTYPE_SYMLINKED) - kernfs_remove_by_name(cgrp->kn, - cgroup_link_name(cgrp, cft, name)); } /** @@ -3837,7 +3821,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; - struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3868,14 +3851,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } - if (cft->flags & CFTYPE_SYMLINKED) { - kn_link = kernfs_create_link(cgrp->kn, - cgroup_link_name(cgrp, cft, name), - kn); - if (IS_ERR(kn_link)) - return PTR_ERR(kn_link); - } - return 0; } -- cgit v1.2.3 From c596687a008b579c503afb7a64fcacc7270fae9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Jun 2019 09:08:27 -0700 Subject: cgroup: Fix css_task_iter_advance_css_set() cset skip condition While adding handling for dying task group leaders c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") added an inverted cset skip condition to css_task_iter_advance_css_set(). It should skip cset if it's completely empty but was incorrectly testing for the inverse condition for the dying_tasks list. Fix it. 
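The inversion is easy to see in isolation (a standalone userspace sketch, not kernel code): the iterator must keep skipping only while a cset has neither live nor dying tasks.

  #include <stdbool.h>
  #include <stdio.h>

  /* fixed predicate: skip the cset only when it is completely empty */
  static bool skip_fixed(bool populated, bool dying_empty)
  {
          return !populated && dying_empty;
  }

  /* buggy predicate: the negated list_empty() made csets whose only
   * remaining tasks were dying get skipped instead of visited
   */
  static bool skip_buggy(bool populated, bool dying_empty)
  {
          return !populated && !dying_empty;
  }

  int main(void)
  {
          /* unpopulated cset that still has dying tasks: must be visited */
          printf("fixed skips: %d, buggy skips: %d\n",
                 skip_fixed(false, false), skip_buggy(false, false));
          return 0;
  }

The fixed predicate prints 0 (visit it) where the buggy one prints 1 (skip it), which is exactly what the one-character change below repairs.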
Signed-off-by: Tejun Heo Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") Reported-by: syzbot+d4bba5ccd4f9a2a68681@syzkaller.appspotmail.com --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9538a12d42d6..6420ff87d72c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4401,7 +4401,7 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset) && !list_empty(&cset->dying_tasks)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; -- cgit v1.2.3 From da2577fdd0932ea4eefe73903f1130ee366767d2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Sat, 8 Jun 2019 12:54:19 -0700 Subject: bpf: lpm_trie: check left child of last leftmost node for NULL If the leftmost parent node of the tree does not have a child on the left side, then trie_get_next_key (and bpftool map dump) will not look at the child on the right. This leads to the traversal missing elements. Lookup is not affected. Update selftest to handle this case. Reproducer: bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \ value 1 entries 256 name test_lpm flags 1 bpftool map update pinned /sys/fs/bpf/lpm key 8 0 0 0 0 0 value 1 bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0 0 128 value 2 bpftool map dump pinned /sys/fs/bpf/lpm Returns only 1 element. (2 expected) Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE") Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..864e2a496376 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -716,9 +716,14 @@ find_leftmost: * have exact two children, so this function will never return NULL. */ for (node = search_root; node;) { - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + if (node->flags & LPM_TREE_NODE_FLAG_IM) { + node = rcu_dereference(node->child[0]); + } else { next_node = node; - node = rcu_dereference(node->child[0]); + node = rcu_dereference(node->child[0]); + if (!node) + node = rcu_dereference(next_node->child[1]); + } } do_copy: next_key->prefixlen = next_node->prefixlen; -- cgit v1.2.3 From f6581f5b55141a95657ef5742cf6a6bfa20a109f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 May 2019 13:31:57 +0200 Subject: ptrace: restore smp_rmb() in __ptrace_may_access() Restore the read memory barrier in __ptrace_may_access() that was deleted a couple years ago. Also add comments on this barrier and the one it pairs with to explain why they're there (as far as I understand). Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks") Cc: stable@vger.kernel.org Acked-by: Kees Cook Acked-by: Oleg Nesterov Signed-off-by: Jann Horn Signed-off-by: Eric W.
Biederman --- kernel/cred.c | 9 +++++++++ kernel/ptrace.c | 10 ++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..07e069d00696 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -450,6 +450,15 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + /* + * If a task drops privileges and becomes nondumpable, + * the dumpability change must become visible before + * the credential change; otherwise, a __ptrace_may_access() + * racing with this change may be able to attach to a task it + * shouldn't be able to attach to (as if the task had dropped + * privileges without becoming nondumpable). + * Pairs with a read barrier in __ptrace_may_access(). + */ smp_wmb(); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 02c6528ead5c..c9b4646ad375 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -323,6 +323,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); + /* + * If a task drops privileges and becomes nondumpable (through a syscall + * like setresuid()) while we are trying to access it, we must ensure + * that the dumpability is read after the credentials; otherwise, + * we may be able to attach to a task that we shouldn't be able to + * attach to (as if the task had dropped privileges without becoming + * nondumpable). + * Pairs with a write barrier in commit_creds(). + */ + smp_rmb(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && -- cgit v1.2.3 From d477f8c202d1f0d4791ab1263ca7657bbe5cf79e Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Wed, 12 Jun 2019 11:50:48 -0400 Subject: cpuset: restore sanity to cpuset_cpus_allowed_fallback() In the case that a process is constrained by taskset(1) (i.e. sched_setaffinity(2)) to a subset of available cpus, and all of those are subsequently offlined, the scheduler will set tsk->cpus_allowed to the current value of task_cs(tsk)->effective_cpus. This is done via a call to do_set_cpus_allowed() in the context of cpuset_cpus_allowed_fallback() made by the scheduler when this case is detected. This is the only call made to cpuset_cpus_allowed_fallback() in the latest mainline kernel. However, this is not sane behavior. I will demonstrate this on a system running the latest upstream kernel with the following initial configuration: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,fffffff Cpus_allowed_list: 0-63 (Where cpus 32-63 are provided via smt.) If we limit our current shell process to cpu2 only and then offline it and reonline it: # taskset -p 4 $$ pid 2272's current affinity mask: ffffffffffffffff pid 2272's new affinity mask: 4 # echo off > /sys/devices/system/cpu/cpu2/online # dmesg | tail -3 [ 2195.866089] process 2272 (bash) no longer affine to cpu2 [ 2195.872700] IRQ 114: no longer affine to CPU2 [ 2195.879128] smpboot: CPU 2 is now offline # echo on > /sys/devices/system/cpu/cpu2/online # dmesg | tail -1 [ 2617.043572] smpboot: Booting Node 0 Processor 2 APIC 0x4 We see that our current process now has an affinity mask containing every cpu available on the system _except_ the one we originally constrained it to: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,fffffffb Cpus_allowed_list: 0-1,3-63 This is not sane behavior, as the scheduler can now not only place the process on previously forbidden cpus, it can't even schedule it on the cpu it was originally constrained to! 
Other cases result in even more exotic affinity masks. Take for instance a process with an affinity mask containing only cpus provided by smt at the moment that smt is toggled, in a configuration such as the following: # taskset -p f000000000 $$ # grep -i cpu /proc/$$/status Cpus_allowed: 000000f0,00000000 Cpus_allowed_list: 36-39 A double toggle of smt results in the following behavior: # echo off > /sys/devices/system/cpu/smt/control # echo on > /sys/devices/system/cpu/smt/control # grep -i cpus /proc/$$/status Cpus_allowed: ffffff00,ffffffff Cpus_allowed_list: 0-31,40-63 This is even less sane than the previous case, as the new affinity mask excludes all smt-provided cpus with ids less than those that were previously in the affinity mask, as well as those that were actually in the mask. With this patch applied, both of these cases end in the following state: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,ffffffff Cpus_allowed_list: 0-63 The original policy is discarded. Though not ideal, it is the simplest way to restore sanity to this fallback case without reinventing the cpuset wheel that rolls down the kernel just fine in cgroup v2. A user who wishes for the previous affinity mask to be restored in this fallback case can use that mechanism instead. This patch modifies scheduler behavior by instead resetting the mask to task_cs(tsk)->cpus_allowed by default, and cpu_possible mask in legacy mode. I tested the cases above on both modes. Note that the scheduler uses this fallback mechanism if and only if _every_ other valid avenue has been traveled, and it is the last resort before calling BUG(). Suggested-by: Waiman Long Suggested-by: Phil Auld Signed-off-by: Joel Savitz Acked-by: Phil Auld Acked-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..515525ff1cfd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + **/ + void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + do_set_cpus_allowed(tsk, is_in_v2_mode() ? + task_cs(tsk)->cpus_allowed : cpu_possible_mask); rcu_read_unlock(); /* -- cgit v1.2.3 From 2e3f139e8ecebf177fe01299285a56855e93fb84 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:21 -0700 Subject: mm/devm_memremap_pages: introduce devm_memunmap_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new devm_release_action() facility to allow devm_memremap_pages_release() to be manually triggered. 
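A hypothetical caller sketch (the driver structure and function names here are invented for illustration) of how the pair is meant to be used:

  #include <linux/device.h>
  #include <linux/err.h>
  #include <linux/memremap.h>

  struct example_drv {
          struct dev_pagemap pgmap;
          void *base;
  };

  static int example_setup(struct device *dev, struct example_drv *drv)
  {
          void *addr = devm_memremap_pages(dev, &drv->pgmap);

          if (IS_ERR(addr))
                  return PTR_ERR(addr);
          drv->base = addr;
          return 0;
  }

  static void example_shutdown(struct device *dev, struct example_drv *drv)
  {
          /* tear the pages down now rather than waiting for generic
           * devres cleanup at device removal
           */
          devm_memunmap_pages(dev, &drv->pgmap);
  }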
Link: http://lkml.kernel.org/r/155727337088.292046.5774214552136776763.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Jérôme Glisse" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 1490e63f69a9..715b434bd316 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -271,6 +271,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) } EXPORT_SYMBOL_GPL(devm_memremap_pages); +void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) +{ + devm_release_action(dev, devm_memremap_pages_release, pgmap); +} +EXPORT_SYMBOL_GPL(devm_memunmap_pages); + unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ -- cgit v1.2.3 From 50f44ee7248ad2f7984ef081974a6ecd09724b3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:33 -0700 Subject: mm/devm_memremap_pages: fix final page put race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Logan noticed that devm_memremap_pages_release() kills the percpu_ref, drops all the page references that were acquired at init, and then immediately proceeds to unplug, arch_remove_memory(), the backing pages for the pagemap. If for some reason device shutdown actually collides with a busy / elevated-ref-count page then arch_remove_memory() should be deferred until after that reference is dropped. As it stands the "wait for last page ref drop" happens *after* devm_memremap_pages_release() returns, which is obviously too late and can lead to crashes. Fix this situation by assigning the responsibility to wait for the percpu_ref to go idle to devm_memremap_pages() with a new ->cleanup() callback. Implement the new cleanup callback for all devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma. Link: http://lkml.kernel.org/r/155727339156.292046.5432007428235387859.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Dan Williams Reported-by: Logan Gunthorpe Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: "Jérôme Glisse" Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 715b434bd316..6e1970719dc2 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data) pgmap->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); + pgmap->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data) * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true * - * 3/ pgmap->ref must be 'live' on entry and will be killed at - * devm_memremap_pages_release() time, or if this routine fails. + * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped + * at devm_memremap_pages_release() time, or if this routine fails.
* * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - if (!pgmap->ref || !pgmap->kill) + if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); + } align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } is_ram = region_intersects(align_start, align_size, @@ -267,6 +272,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap_array_delete(res); err_array: pgmap->kill(pgmap->ref); + pgmap->cleanup(pgmap->ref); + return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); -- cgit v1.2.3 From e3ff9c3678b4d80e22d2557b68726174578eaf52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Jun 2019 21:40:45 +0200 Subject: timekeeping: Repair ktime_get_coarse*() granularity Jason reported that the coarse ktime based time getters advance only once per second and not once per tick as advertised. The code reads only the monotonic base time, which advances once per second. The nanoseconds are accumulated on every tick in xtime_nsec up to a second and the regular time getters take this nanoseconds offset into account, but the ktime_get_coarse*() implementation fails to do so. Add the accumulated xtime_nsec value to the monotonic base time to get the proper per tick advancing coarse tinme. Fixes: b9ff604cff11 ("timekeeping: Add ktime_get_coarse_with_offset") Reported-by: Jason A. Donenfeld Signed-off-by: Thomas Gleixner Tested-by: Jason A. Donenfeld Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Clemens Ladisch Cc: Sultan Alsawaf Cc: Waiman Long Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1906132136280.1791@nanos.tec.linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 85f5912d8f70..44b726bab4bd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; + u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); - return base; - + return base + nsecs; } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); -- cgit v1.2.3 From becf33f694dc50656766e0fde8883437d5c8d4b4 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Mon, 10 Jun 2019 13:00:16 +0900 Subject: tracing: Fix out-of-range read in trace_stack_print() Puts range check before dereferencing the pointer. 
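The ordering matters because && short-circuits left to right, so the bound check must run before the dereference. A minimal userspace sketch of the same bug class (names invented):

  #include <stdio.h>

  static long sum_until_sentinel(const long *p, const long *end)
  {
          long sum = 0;

          /* correct: 'p < end' is evaluated before '*p', so the loop can
           * never read past the buffer; the reversed order
           * '*p != -1 && p < end' dereferences one element too far
           * whenever no sentinel is present
           */
          for (; p && p < end && *p != -1; p++)
                  sum += *p;
          return sum;
  }

  int main(void)
  {
          long stack[4] = { 1, 2, 3, 4 };         /* no sentinel */

          printf("%ld\n", sum_until_sentinel(stack, stack + 4));
          return 0;
  }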
Reproducer: # echo stacktrace > trace_options # echo 1 > events/enable # cat trace > /dev/null KASAN report: ================================================================== BUG: KASAN: use-after-free in trace_stack_print+0x26b/0x2c0 Read of size 8 at addr ffff888069d20000 by task cat/1953 CPU: 0 PID: 1953 Comm: cat Not tainted 5.2.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 Call Trace: dump_stack+0x8a/0xce print_address_description+0x60/0x224 ? trace_stack_print+0x26b/0x2c0 ? trace_stack_print+0x26b/0x2c0 __kasan_report.cold+0x1a/0x3e ? trace_stack_print+0x26b/0x2c0 kasan_report+0xe/0x20 trace_stack_print+0x26b/0x2c0 print_trace_line+0x6ea/0x14d0 ? tracing_buffers_read+0x700/0x700 ? trace_find_next_entry_inc+0x158/0x1d0 s_show+0xea/0x310 seq_read+0xaa7/0x10e0 ? seq_escape+0x230/0x230 __vfs_read+0x7c/0x100 vfs_read+0x16c/0x3a0 ksys_read+0x121/0x240 ? kernel_write+0x110/0x110 ? perf_trace_sys_enter+0x8a0/0x8a0 ? syscall_slow_exit_work+0xa9/0x410 do_syscall_64+0xb7/0x390 ? prepare_exit_to_usermode+0x165/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f867681f910 Code: b6 fe ff ff 48 8d 3d 0f be 08 00 48 83 ec 08 e8 06 db 01 00 66 0f 1f 44 00 00 83 3d f9 2d 2c 00 00 75 10 b8 00 00 00 00 04 RSP: 002b:00007ffdabf23488 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f867681f910 RDX: 0000000000020000 RSI: 00007f8676cde000 RDI: 0000000000000003 RBP: 00007f8676cde000 R08: ffffffffffffffff R09: 0000000000000000 R10: 0000000000000871 R11: 0000000000000246 R12: 00007f8676cde000 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000000ec0 Allocated by task 1214: save_stack+0x1b/0x80 __kasan_kmalloc.constprop.0+0xc2/0xd0 kmem_cache_alloc+0xaf/0x1a0 getname_flags+0xd2/0x5b0 do_sys_open+0x277/0x5a0 do_syscall_64+0xb7/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 1214: save_stack+0x1b/0x80 __kasan_slab_free+0x12c/0x170 kmem_cache_free+0x8a/0x1c0 putname+0xe1/0x120 do_sys_open+0x2c5/0x5a0 do_syscall_64+0xb7/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888069d20000 which belongs to the cache names_cache of size 4096 The buggy address is located 0 bytes inside of 4096-byte region [ffff888069d20000, ffff888069d21000) The buggy address belongs to the page: page:ffffea0001a74800 refcount:1 mapcount:0 mapping:ffff88806ccd1380 index:0x0 compound_mapcount: 0 flags: 0x100000000010200(slab|head) raw: 0100000000010200 dead000000000100 dead000000000200 ffff88806ccd1380 raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888069d1ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888069d1ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888069d20000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888069d20080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888069d20100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Link: http://lkml.kernel.org/r/20190610040016.5598-1-devel@etsukata.com Fixes: 4285f2fcef80 ("tracing: Remove the ULONG_MAX stack trace hackery") Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 54373d93e251..ba751f993c3b 
100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "\n"); - for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { + for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) { if (trace_seq_has_overflowed(s)) break; -- cgit v1.2.3 From cbdaeaf050b730ea02e9ab4ff844ce54d85dbe1d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 5 Jun 2019 13:11:58 +0200 Subject: tracing: avoid build warning with HAVE_NOP_MCOUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting HAVE_NOP_MCOUNT enables -mnop-mcount (if gcc supports it) and sets CC_USING_NOP_MCOUNT. Reuse __is_defined (which is suitable for testing CC_USING_* defines) to avoid conditional compilation and fix the following gcc 9 warning on s390: kernel/trace/ftrace.c:2514:1: warning: ‘ftrace_code_disable’ defined but not used [-Wunused-function] Link: http://lkml.kernel.org/r/patch.git-1a82d13f33ac.your-ad-here.call-01559732716-ext-6629@work.hours Fixes: 2f4df0017baed ("tracing: Add -mcount-nop option support") Signed-off-by: Vasily Gorbik Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..e77a6c92620f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2935,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; -#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!ftrace_code_disable(mod, p)) + if (!__is_defined(CC_USING_NOP_MCOUNT) && + !ftrace_code_disable(mod, p)) break; -#endif update_cnt++; } -- cgit v1.2.3 From ff585c5b9a27e64084c84e2ddf24fd00bf8dcfc1 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 14 Jun 2019 23:32:10 +0800 Subject: tracing: Make two symbols static Fix sparse warnings: kernel/trace/trace.c:6927:24: warning: symbol 'get_tracing_log_err' was not declared. Should it be static? kernel/trace/trace.c:8196:15: warning: symbol 'trace_instance_dir' was not declared. Should it be static? 
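For reference, this class of warning comes from building with sparse (e.g. "make C=1") against symbols that have external linkage but no declaration visible in any header; when nothing outside the translation unit uses them, internal linkage is the fix:

  /* before: external linkage, but no other file references it */
  struct dentry *trace_instance_dir;

  /* after: the file-local intent is explicit and sparse is satisfied */
  static struct dentry *trace_instance_dir;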
Link: http://lkml.kernel.org/r/20190614153210.24424-1-yuehaibing@huawei.com Acked-by: Tom Zanussi Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c80521fd436..83e08b78dbee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6923,7 +6923,7 @@ struct tracing_log_err { static DEFINE_MUTEX(tracing_err_log_lock); -struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; @@ -8192,7 +8192,7 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; -struct dentry *trace_instance_dir; +static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); -- cgit v1.2.3 From f01098c74b5219f3969d4750eeed1a36bfc038e3 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 14 Jun 2019 16:40:25 +0900 Subject: tracing/uprobe: Fix NULL pointer dereference in trace_uprobe_create() Just like the case of commit 8b05a3a7503c ("tracing/kprobes: Fix NULL pointer dereference in trace_kprobe_create()"), writing an incorrectly formatted string to uprobe_events can trigger NULL pointer dereference. Reproducer: # echo r > /sys/kernel/debug/tracing/uprobe_events dmesg: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000079d12067 P4D 8000000079d12067 PUD 7b7ab067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1903 Comm: bash Not tainted 5.2.0-rc3+ #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 RIP: 0010:strchr+0x0/0x30 Code: c0 eb 0d 84 c9 74 18 48 83 c0 01 48 39 d0 74 0f 0f b6 0c 07 3a 0c 06 74 ea 19 c0 83 c8 01 c3 31 c0 c3 0f 1f 84 00 00 00 00 00 <0f> b6 07 89 f2 40 38 f0 75 0e eb 13 0f b6 47 01 48 83 c RSP: 0018:ffffb55fc0403d10 EFLAGS: 00010293 RAX: ffff993ffb793400 RBX: 0000000000000000 RCX: ffffffffa4852625 RDX: 0000000000000000 RSI: 000000000000002f RDI: 0000000000000000 RBP: ffffb55fc0403dd0 R08: ffff993ffb793400 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff993ff9cc1668 R14: 0000000000000001 R15: 0000000000000000 FS: 00007f30c5147700(0000) GS:ffff993ffda00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b628000 CR4: 00000000000006f0 Call Trace: trace_uprobe_create+0xe6/0xb10 ? __kmalloc_track_caller+0xe6/0x1c0 ? __kmalloc+0xf0/0x1d0 ? trace_uprobe_create+0xb10/0xb10 create_or_delete_trace_uprobe+0x35/0x90 ? trace_uprobe_create+0xb10/0xb10 trace_run_command+0x9c/0xb0 trace_parse_run_command+0xf9/0x1eb ?
probes_open+0x80/0x80 __vfs_write+0x43/0x90 vfs_write+0x14a/0x2a0 ksys_write+0xa2/0x170 do_syscall_64+0x7f/0x200 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: http://lkml.kernel.org/r/20190614074026.8045-1-devel@etsukata.com Cc: stable@vger.kernel.org Fixes: 0597c49c69d5 ("tracing/uprobes: Use dyn_event framework for uprobe events") Reviewed-by: Srikar Dronamraju Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index eb7e06b54741..a88c692e3b8a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -443,10 +443,17 @@ static int trace_uprobe_create(int argc, const char **argv) ret = 0; ref_ctr_offset = 0; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') + switch (argv[0][0]) { + case 'r': is_return = true; - else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + + if (argc < 2) return -ECANCELED; if (argv[0][1] == ':') -- cgit v1.2.3 From a4158345ec5acb44cc0a9ef4381e0784c1bc7722 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 14 Jun 2019 16:40:26 +0900 Subject: tracing/uprobe: Fix obsolete comment on trace_uprobe_create() Commit 0597c49c69d5 ("tracing/uprobes: Use dyn_event framework for uprobe events") cleaned up the usage of trace_uprobe_create(), and the function has been no longer used for removing uprobe/uretprobe. Link: http://lkml.kernel.org/r/20190614074026.8045-2-devel@etsukata.com Reviewed-by: Srikar Dronamraju Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a88c692e3b8a..b55906c77ce0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -426,8 +426,6 @@ end: /* * Argument syntax: * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] - * - * - Remove uprobe: -:[GRP/]EVENT */ static int trace_uprobe_create(int argc, const char **argv) { -- cgit v1.2.3 From 9f255b632bf12c4dd7fc31caee89aa991ef75176 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 13 Jun 2019 20:07:22 -0500 Subject: module: Fix livepatch/ftrace module text permissions race It's possible for livepatch and ftrace to be toggling a module's text permissions at the same time, resulting in the following panic: BUG: unable to handle page fault for address: ffffffffc005b1d9 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 3ea0c067 P4D 3ea0c067 PUD 3ea0e067 PMD 3cc13067 PTE 3b8a1061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 1 PID: 453 Comm: insmod Tainted: G O K 5.2.0-rc1-a188339ca5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014 RIP: 0010:apply_relocate_add+0xbe/0x14c Code: fa 0b 74 21 48 83 fa 18 74 38 48 83 fa 0a 75 40 eb 08 48 83 38 00 74 33 eb 53 83 38 00 75 4e 89 08 89 c8 eb 0a 83 38 00 75 43 <89> 08 48 63 c1 48 39 c8 74 2e eb 48 83 38 00 75 32 48 29 c1 89 08 RSP: 0018:ffffb223c00dbb10 EFLAGS: 00010246 RAX: ffffffffc005b1d9 RBX: 0000000000000000 RCX: ffffffff8b200060 RDX: 000000000000000b RSI: 0000004b0000000b RDI: ffff96bdfcd33000 RBP: ffffb223c00dbb38 R08: ffffffffc005d040 R09: ffffffffc005c1f0 R10: ffff96bdfcd33c40 R11: ffff96bdfcd33b80 R12: 0000000000000018 R13: ffffffffc005c1f0 R14: ffffffffc005e708 R15: 
ffffffff8b2fbc74 FS: 00007f5f447beba8(0000) GS:ffff96bdff900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc005b1d9 CR3: 000000003cedc002 CR4: 0000000000360ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: klp_init_object_loaded+0x10f/0x219 ? preempt_latency_start+0x21/0x57 klp_enable_patch+0x662/0x809 ? virt_to_head_page+0x3a/0x3c ? kfree+0x8c/0x126 patch_init+0x2ed/0x1000 [livepatch_test02] ? 0xffffffffc0060000 do_one_initcall+0x9f/0x1c5 ? kmem_cache_alloc_trace+0xc4/0xd4 ? do_init_module+0x27/0x210 do_init_module+0x5f/0x210 load_module+0x1c41/0x2290 ? fsnotify_path+0x3b/0x42 ? strstarts+0x2b/0x2b ? kernel_read+0x58/0x65 __do_sys_finit_module+0x9f/0xc3 ? __do_sys_finit_module+0x9f/0xc3 __x64_sys_finit_module+0x1a/0x1c do_syscall_64+0x52/0x61 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above panic occurs when loading two modules at the same time with ftrace enabled, where at least one of the modules is a livepatch module: CPU0 CPU1 klp_enable_patch() klp_init_object_loaded() module_disable_ro() ftrace_module_enable() ftrace_arch_code_modify_post_process() set_all_modules_text_ro() klp_write_object_relocations() apply_relocate_add() *patches read-only code* - BOOM A similar race exists when toggling ftrace while loading a livepatch module. Fix it by ensuring that the livepatch and ftrace code patching operations -- and their respective permissions changes -- are protected by the text_mutex. Link: http://lkml.kernel.org/r/ab43d56ab909469ac5d2520c5d944ad6d4abd476.1560474114.git.jpoimboe@redhat.com Reported-by: Johannes Erdfelt Fixes: 444d13ff10fb ("modules: add ro_after_init support") Acked-by: Jessica Yu Reviewed-by: Petr Mladek Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/core.c | 6 ++++++ kernel/trace/ftrace.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 91cd519756d3..2d17e6e364b5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "core.h" #include "patch.h" @@ -730,16 +731,21 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); if (ret) { module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); return ret; } arch_klp_init_object_loaded(patch, obj); module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e77a6c92620f..a89700590485 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -2610,10 +2611,12 @@ static void ftrace_run_update_code(int command) { int ret; + mutex_lock(&text_mutex); + ret = ftrace_arch_code_modify_prepare(); FTRACE_WARN_ON(ret); if (ret) - return; + goto out_unlock; /* * By default we use stop_machine() to modify the code. 
@@ -2625,6 +2628,9 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); + +out_unlock: + mutex_unlock(&text_mutex); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -5775,6 +5781,7 @@ void ftrace_module_enable(struct module *mod) struct ftrace_page *pg; mutex_lock(&ftrace_lock); + mutex_lock(&text_mutex); if (ftrace_disabled) goto out_unlock; @@ -5836,6 +5843,7 @@ void ftrace_module_enable(struct module *mod) ftrace_arch_code_modify_post_process(); out_unlock: + mutex_unlock(&text_mutex); mutex_unlock(&ftrace_lock); process_cached_mods(mod->name); -- cgit v1.2.3 From 04e03d9a616c19a47178eaca835358610e63a1dd Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 6 Jun 2019 11:17:54 +0800 Subject: ftrace: Fix NULL pointer dereference in free_ftrace_func_mapper() The mapper may be NULL when called from register_ftrace_function_probe() with probe->data == NULL. This issue can be reproduced as follow (it may be covered by compiler optimization sometime): / # cat /sys/kernel/debug/tracing/set_ftrace_filter #### all functions enabled #### / # echo foo_bar:dump > /sys/kernel/debug/tracing/set_ftrace_filter [ 206.949100] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 206.952402] Mem abort info: [ 206.952819] ESR = 0x96000006 [ 206.955326] Exception class = DABT (current EL), IL = 32 bits [ 206.955844] SET = 0, FnV = 0 [ 206.956272] EA = 0, S1PTW = 0 [ 206.956652] Data abort info: [ 206.957320] ISV = 0, ISS = 0x00000006 [ 206.959271] CM = 0, WnR = 0 [ 206.959938] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000419f3a000 [ 206.960483] [0000000000000000] pgd=0000000411a87003, pud=0000000411a83003, pmd=0000000000000000 [ 206.964953] Internal error: Oops: 96000006 [#1] SMP [ 206.971122] Dumping ftrace buffer: [ 206.973677] (ftrace buffer empty) [ 206.975258] Modules linked in: [ 206.976631] Process sh (pid: 281, stack limit = 0x(____ptrval____)) [ 206.978449] CPU: 10 PID: 281 Comm: sh Not tainted 5.2.0-rc1+ #17 [ 206.978955] Hardware name: linux,dummy-virt (DT) [ 206.979883] pstate: 60000005 (nZCv daif -PAN -UAO) [ 206.980499] pc : free_ftrace_func_mapper+0x2c/0x118 [ 206.980874] lr : ftrace_count_free+0x68/0x80 [ 206.982539] sp : ffff0000182f3ab0 [ 206.983102] x29: ffff0000182f3ab0 x28: ffff8003d0ec1700 [ 206.983632] x27: ffff000013054b40 x26: 0000000000000001 [ 206.984000] x25: ffff00001385f000 x24: 0000000000000000 [ 206.984394] x23: ffff000013453000 x22: ffff000013054000 [ 206.984775] x21: 0000000000000000 x20: ffff00001385fe28 [ 206.986575] x19: ffff000013872c30 x18: 0000000000000000 [ 206.987111] x17: 0000000000000000 x16: 0000000000000000 [ 206.987491] x15: ffffffffffffffb0 x14: 0000000000000000 [ 206.987850] x13: 000000000017430e x12: 0000000000000580 [ 206.988251] x11: 0000000000000000 x10: cccccccccccccccc [ 206.988740] x9 : 0000000000000000 x8 : ffff000013917550 [ 206.990198] x7 : ffff000012fac2e8 x6 : ffff000012fac000 [ 206.991008] x5 : ffff0000103da588 x4 : 0000000000000001 [ 206.991395] x3 : 0000000000000001 x2 : ffff000013872a28 [ 206.991771] x1 : 0000000000000000 x0 : 0000000000000000 [ 206.992557] Call trace: [ 206.993101] free_ftrace_func_mapper+0x2c/0x118 [ 206.994827] ftrace_count_free+0x68/0x80 [ 206.995238] release_probe+0xfc/0x1d0 [ 206.995555] register_ftrace_function_probe+0x4a8/0x868 [ 206.995923] ftrace_trace_probe_callback.isra.4+0xb8/0x180 [ 206.996330] ftrace_dump_callback+0x50/0x70 [ 206.996663] ftrace_regex_write.isra.29+0x290/0x3a8 [ 206.997157] 
ftrace_filter_write+0x44/0x60 [ 206.998971] __vfs_write+0x64/0xf0 [ 206.999285] vfs_write+0x14c/0x2f0 [ 206.999591] ksys_write+0xbc/0x1b0 [ 206.999888] __arm64_sys_write+0x3c/0x58 [ 207.000246] el0_svc_common.constprop.0+0x408/0x5f0 [ 207.000607] el0_svc_handler+0x144/0x1c8 [ 207.000916] el0_svc+0x8/0xc [ 207.003699] Code: aa0003f8 a9025bf5 aa0103f5 f946ea80 (f9400303) [ 207.008388] ---[ end trace 7b6d11b5f542bdf1 ]--- [ 207.010126] Kernel panic - not syncing: Fatal exception [ 207.011322] SMP: stopping secondary CPUs [ 207.013956] Dumping ftrace buffer: [ 207.014595] (ftrace buffer empty) [ 207.015632] Kernel Offset: disabled [ 207.017187] CPU features: 0x002,20006008 [ 207.017985] Memory Limit: none [ 207.019825] ---[ end Kernel panic - not syncing: Fatal exception ]--- Link: http://lkml.kernel.org/r/20190606031754.10798-1-liwei391@huawei.com Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a89700590485..38277af44f5c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4226,10 +4226,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, struct ftrace_func_entry *entry; struct ftrace_func_map *map; struct hlist_head *hhd; - int size = 1 << mapper->hash.size_bits; - int i; + int size, i; + + if (!mapper) + return; if (free_func && mapper->hash.count) { + size = 1 << mapper->hash.size_bits; for (i = 0; i < size; i++) { hhd = &mapper->hash.buckets[i]; hlist_for_each_entry(entry, hhd, hlist) { -- cgit v1.2.3 From d4dd153d551634683fccf8881f606fa9f3dfa1ef Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:13 +0900 Subject: bpf, devmap: Fix premature entry free on destroying map dev_map_free() waits for flush_needed bitmap to be empty in order to ensure all flush operations have completed before freeing its entries. However the corresponding clear_bit() was called before using the entries, so the entries could be used after free. All access to the entries needs to be done before clearing the bit. It seems commit a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") accidentally changed the clear_bit() and memory access order. Note that the problem happens only in __dev_map_flush(), not in dev_map_flush_old(). dev_map_flush_old() is called only after nulling out the corresponding netdev_map entry, so dev_map_free() never frees the entry thus no such race happens there. Fixes: a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..e001fb1a96b1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -291,10 +291,10 @@ void __dev_map_flush(struct bpf_map *map) if (unlikely(!dev)) continue; - __clear_bit(bit, bitmap); - bq = this_cpu_ptr(dev->bulkq); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); + + __clear_bit(bit, bitmap); } } -- cgit v1.2.3 From edabf4d9dd905acd60048ea1579943801e3a4876 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:14 +0900 Subject: bpf, devmap: Add missing bulk queue free dev_map_free() forgot to free bulk queue when freeing its entries. 
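In sketch form (abbreviated for illustration, not the literal devmap code), every per-cpu allocation made when an entry is created needs a matching free on the map teardown path:

  /* at entry creation */
  dev->bulkq = alloc_percpu(struct xdp_bulk_queue);
  if (!dev->bulkq)
          return -ENOMEM;

  /* at map teardown in dev_map_free() */
  free_percpu(dev->bulkq);        /* the line this fix adds */
  dev_put(dev->dev);
  kfree(dev);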
Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e001fb1a96b1..a126d95d12de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -186,6 +186,7 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } -- cgit v1.2.3 From 86723c8640633bee4b4588d3c7784ee7a0032f65 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:15 +0900 Subject: bpf, devmap: Add missing RCU read lock on flush .ndo_xdp_xmit() assumes it is called under RCU. For example virtio_net uses RCU to detect it has setup the resources for tx. The assumption accidentally broke when introducing bulk queue in devmap. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Reported-by: David Ahern Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a126d95d12de..1defea4b2755 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -282,6 +282,7 @@ void __dev_map_flush(struct bpf_map *map) unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); u32 bit; + rcu_read_lock(); for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; @@ -297,6 +298,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); } + rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -389,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) int cpu; + rcu_read_lock(); for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); @@ -396,6 +399,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) bq = per_cpu_ptr(dev->bulkq, cpu); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } + rcu_read_unlock(); } } -- cgit v1.2.3 From a8e11e5c5611a9f70470aebeb2c1dd6132f609d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2019 16:22:18 -0700 Subject: sysctl: define proc_do_static_key() Convert proc_dointvec_minmax_bpf_stats() into a more generic helper, since we are going to use jump labels more often. Note that sysctl_bpf_stats_enabled is removed, since it is no longer needed/used. Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 1 - kernel/sysctl.c | 44 +++++++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c473f208a10..080e2bb644cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2097,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -int sysctl_bpf_stats_enabled __read_mostly; /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d1008be6173..1beca96fb625 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -230,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_BPF_SYSCALL -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1253,12 +1248,10 @@ static struct ctl_table kern_table[] = { }, { .procname = "bpf_stats_enabled", - .data = &sysctl_bpf_stats_enabled, - .maxlen = sizeof(sysctl_bpf_stats_enabled), + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_dointvec_minmax_bpf_stats, - .extra1 = &zero, - .extra2 = &one, + .proc_handler = proc_do_static_key, }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) @@ -3374,26 +3367,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - int ret, bpf_stats = *(int *)table->data; - struct ctl_table tmp = *table; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &one, + }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - tmp.data = &bpf_stats; + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { - *(int *)table->data = bpf_stats; - if (bpf_stats) - static_branch_enable(&bpf_stats_enabled_key); + if (val) + static_key_enable(key); else - static_branch_disable(&bpf_stats_enabled_key); + static_key_disable(key); } + mutex_unlock(&static_key_mutex); return ret; } #endif -- cgit v1.2.3 From 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Tue, 11 Jun 2019 14:53:04 -0700 Subject: bpf: fix nested bpf tracepoints with per-cpu data BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as they do not increment bpf_prog_active while executing. This enables three levels of nesting, to support - a kprobe or raw tp or perf event, - another one of the above that irq context happens to call, and - another one in nmi context (at most one of which may be a kprobe or perf event). 
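Condensed from the hunks that follow, the fix reserves one scratch perf_sample_data per context and indexes it with a per-CPU nesting counter, bailing out with -EBUSY if nesting ever exceeds the three levels listed above:

    struct bpf_trace_sample_data {
            struct perf_sample_data sds[3];         /* normal, irq, nmi */
    };
    static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
    static DEFINE_PER_CPU(int, bpf_trace_nest_level);

    struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
    int nest_level = this_cpu_inc_return(bpf_trace_nest_level);

    if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds)))
            goto out;                       /* err = -EBUSY, then dec */
    sd = &sds->sds[nest_level - 1];         /* this context's private slot */

The same counter-plus-array scheme covers the raw-tracepoint pt_regs scratch space through get_bpf_raw_tp_regs()/put_bpf_raw_tp_regs() in the diff below.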
Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Signed-off-by: Matt Mullins Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 100 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..1c9a4745e596 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); - static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_sample_data *sd) @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return perf_event_output(event, sd, regs); } +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; + struct perf_sample_data *sd; + int err; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, sd); + +out: + this_cpu_dec(bpf_trace_nest_level); + return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. 
*/ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, - flags, 0, 0); + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, - (unsigned long) size, flags, 0); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { -- cgit v1.2.3 From 085ebfe937d7a7a5df1729f35a12d6d655fea68c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 14:37:24 +0200 Subject: perf/core: Fix perf_sample_regs_user() mm check perf_sample_regs_user() uses 'current->mm' to test for the presence of userspace, but this is insufficient, consider use_mm(). A better test is: '!(current->flags & PF_KTHREAD)', exec() clears PF_KTHREAD after it sets the new ->mm but before it drops to userspace for the first time. Possibly obsoletes: bf05fc25f268 ("powerpc/perf: Fix oops when kthread execs user process") Reported-by: Ravi Bangoria Reported-by: Young Xiao <92siuyang@gmail.com> Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: 4018994f3d87 ("perf: Add ability to attach user level registers dump to sample") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..2e32faac5511 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5923,7 +5923,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (current->mm) { + } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; -- cgit v1.2.3 From 40b0b3f8fb2d8f55d13ceed41593d46689a6b496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jun 2019 07:44:46 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 230 Based on 2 normalized pattern(s): this source code is licensed under the gnu general public license version 2 see the file copying for more details this source code is licensed under general public license version 2 see extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 52 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Reviewed-by: Alexios Zavras Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190602204653.449021192@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/crash_core.c | 4 +--- kernel/kexec.c | 4 +--- kernel/kexec_core.c | 4 +--- kernel/kexec_file.c | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 093c9f917ed0..9f1557b98468 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * crash.c - kernel crash support code. * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include diff --git a/kernel/kexec.c b/kernel/kexec.c index 68559808fdfa..1b018f1a6e0d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fd5c95ff9251..d5870723b8ad 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 072b6ee55e3f..ef7b951a8087 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec: kexec_file_load system call * * Copyright (C) 2014 Red Hat Inc. * Authors: * Vivek Goyal - * - * This source code is licensed under the GNU General Public License, - * Version 2. 
See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- cgit v1.2.3 From 8092f73c51567470bd79472c6eb25d2e1841fac3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jun 2019 07:45:04 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 248 Based on 1 normalized pattern(s): this file is released under the gpl v2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 3 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Reviewed-by: Armijn Hemel Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190602204655.103854853@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/power/poweroff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7ef6866b521d..6d475281c730 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 */ #include -- cgit v1.2.3 From f85d208658468b1a298f31daddb05a7684969cd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:10:45 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 451 Based on 1 normalized pattern(s): this file is subject to the terms and conditions of version 2 of the gnu general public license see the file copying in the main directory of the linux distribution for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081200.872755311@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 5 +---- kernel/bpf/lpm_trie.c | 5 +---- kernel/cgroup/pids.c | 5 +---- kernel/cgroup/rdma.c | 5 +---- 4 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fcde0f7b2585..92a7d0cf8d13 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Functions to manage eBPF programs attached to cgroups * * Copyright (c) 2016 Daniel Mack - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..1647a3f763a7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Longest prefix match list implementation * * Copyright (c) 2016,2017 Daniel Mack * Copyright (c) 2016 David Herrmann - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. 
*/ #include diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index c9960baaa14f..8e513a573fe9 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * @@ -25,10 +26,6 @@ * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 1d75ae7f1cb7..ae042c347c64 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * @@ -5,10 +6,6 @@ * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include -- cgit v1.2.3 From d2912cb15bdda8ba4a5dd73396ad62641af2f520 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:11:33 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 500 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation # extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 4122 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Kate Stewart Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081206.933168790@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/inode.c | 5 +---- kernel/compat.c | 5 +---- kernel/sched/debug.c | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 84a80b02db99..cc0d0cf114e3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. @@ -5,10 +6,6 @@ * Authors: * * Daniel Borkmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. */ #include diff --git a/kernel/compat.c b/kernel/compat.c index b5f7063c0db6..a2bc1d6ceb57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/compat.c * @@ -5,10 +6,6 @@ * on 64 bit kernels. * * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 678bfb9bd87f..14c6a8716ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/debug.c * * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include "sched.h" -- cgit v1.2.3 From 9014143bab2f3bc0b9e5db3bc8d00e2a43e50fbd Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sun, 23 Jun 2019 14:27:17 +0300 Subject: fork: don't check parent_tidptr with CLONE_PIDFD Give userspace a cheap and reliable way to tell whether CLONE_PIDFD is supported by the kernel or not. The easiest way is to pass an invalid file descriptor value in parent_tidptr, perform the syscall and verify that parent_tidptr has been changed to a valid file descriptor value. CLONE_PIDFD uses parent_tidptr to return pidfds. CLONE_PARENT_SETTID will use parent_tidptr to return the tid of the parent. The two flags cannot be used together. Old kernels that only support CLONE_PARENT_SETTID will not verify the value pointed to by parent_tidptr. This behavior is unchanged even with the introduction of CLONE_PIDFD. However, if CLONE_PIDFD is specified the kernel will currently check the value pointed to by parent_tidptr before placing the pidfd in the memory pointed to. EINVAL will be returned if the value in parent_tidptr is not 0. If CLONE_PIDFD is supported and fd 0 is closed, then the returned pidfd can and likely will be 0 and parent_tidptr will be unchanged. This means userspace must either check CLONE_PIDFD support beforehand or check that fd 0 is not closed when invoking CLONE_PIDFD. The check for pidfd == 0 was introduced during the v5.2 merge window by commit b3e583825266 ("clone: add CLONE_PIDFD") to ensure that CLONE_PIDFD could be potentially extended by passing in flags through the return argument. However, that extension would look horrible, and with the upcoming introduction of the clone3 syscall in v5.3 there is no need to extend legacy clone syscall this way. (Even if it would need to be extended, CLONE_DETACHED can be reused with CLONE_PIDFD.) So remove the pidfd == 0 check. Userspace that needs to be portable to kernels without CLONE_PIDFD support can then be advised to initialize pidfd to -1 and check the pidfd value returned by CLONE_PIDFD. Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Dmitry V. Levin Signed-off-by: Christian Brauner --- kernel/fork.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..39a3adaa4ad1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1822,8 +1822,6 @@ static __latent_entropy struct task_struct *copy_process( } if (clone_flags & CLONE_PIDFD) { - int reserved; - /* * - CLONE_PARENT_SETTID is useless for pidfds and also * parent_tidptr is used to return pidfds. @@ -1834,16 +1832,6 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) return ERR_PTR(-EINVAL); - - /* - * Verify that parent_tidptr is sane so we can potentially - * reuse it later. 
- */ - if (get_user(reserved, parent_tidptr)) - return ERR_PTR(-EFAULT); - - if (reserved != 0) - return ERR_PTR(-EINVAL); } /* -- cgit v1.2.3 From 913a90bc5a3a06b1f04c337320e9aeee2328dd77 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 4 Jun 2019 09:59:53 +0530 Subject: perf/ioctl: Add check for the sample_period value perf_event_open() limits the sample_period to 63 bits. See: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits") Make ioctl() consistent with it. Also on PowerPC, negative sample_period could cause a recursive PMIs leading to a hang (reported when running perf-fuzzer). Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: maddy@linux.vnet.ibm.com Cc: mpe@ellerman.id.au Fixes: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits") Link: https://lkml.kernel.org/r/20190604042953.914-1-ravi.bangoria@linux.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2e32faac5511..8d1c62df20a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5005,6 +5005,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (perf_event_check_period(event, value)) return -EINVAL; + if (!event->attr.freq && (value & (1ULL << 63))) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; -- cgit v1.2.3 From e321d02db87af7840da29ef833a2a71fc0eab198 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 May 2019 15:08:30 -0700 Subject: perf/x86: Disable extended registers for non-supported PMUs The perf fuzzer caused Skylake machine to crash: [ 9680.085831] Call Trace: [ 9680.088301] [ 9680.090363] perf_output_sample_regs+0x43/0xa0 [ 9680.094928] perf_output_sample+0x3aa/0x7a0 [ 9680.099181] perf_event_output_forward+0x53/0x80 [ 9680.103917] __perf_event_overflow+0x52/0xf0 [ 9680.108266] ? perf_trace_run_bpf_submit+0xc0/0xc0 [ 9680.113108] perf_swevent_hrtimer+0xe2/0x150 [ 9680.117475] ? check_preempt_wakeup+0x181/0x230 [ 9680.122091] ? check_preempt_curr+0x62/0x90 [ 9680.126361] ? ttwu_do_wakeup+0x19/0x140 [ 9680.130355] ? try_to_wake_up+0x54/0x460 [ 9680.134366] ? reweight_entity+0x15b/0x1a0 [ 9680.138559] ? __queue_work+0x103/0x3f0 [ 9680.142472] ? update_dl_rq_load_avg+0x1cd/0x270 [ 9680.147194] ? timerqueue_del+0x1e/0x40 [ 9680.151092] ? __remove_hrtimer+0x35/0x70 [ 9680.155191] __hrtimer_run_queues+0x100/0x280 [ 9680.159658] hrtimer_interrupt+0x100/0x220 [ 9680.163835] smp_apic_timer_interrupt+0x6a/0x140 [ 9680.168555] apic_timer_interrupt+0xf/0x20 [ 9680.172756] The XMM registers can only be collected by PEBS hardware events on the platforms with PEBS baseline support, e.g. Icelake, not software/probe events. Add capabilities flag PERF_PMU_CAP_EXTENDED_REGS to indicate the PMU which support extended registers. For X86, the extended registers are XMM registers. Add has_extended_regs() to check if extended registers are applied. The generic code define the mask of extended registers as 0 if arch headers haven't overridden it. 
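The arch-override pattern described here lives in headers outside kernel/, so it does not appear in this log; presumably it looks like the following sketch, with a generic fallback of 0 and an x86 definition covering the XMM range:

    /* generic header (sketch): default when an arch defines nothing */
    #ifndef PERF_REG_EXTENDED_MASK
    #define PERF_REG_EXTENDED_MASK  0
    #endif

    /* x86 (sketch): everything from XMM0 upward counts as extended */
    #define PERF_REG_EXTENDED_MASK  (~((1ULL << PERF_REG_X86_XMM0) - 1))

With the generic mask at 0, has_extended_regs() in the hunk that follows is constant-false on architectures that never opt in.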
Originally-by: Peter Zijlstra (Intel) Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: 878068ea270e ("perf/x86: Support outputting XMM registers") Link: https://lkml.kernel.org/r/1559081314-9714-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8d1c62df20a7..f85929ce13be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10036,6 +10036,12 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static inline bool has_extended_regs(struct perf_event *event) +{ + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || + (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); +} + static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; @@ -10067,12 +10073,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) + ret = -EOPNOTSUPP; + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) { - if (event->destroy) - event->destroy(event); + event_has_any_exclude_flag(event)) ret = -EINVAL; - } + + if (ret && event->destroy) + event->destroy(event); } if (ret) -- cgit v1.2.3 From 1bf72720281770162c87990697eae1ba2f1d917a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 16 May 2019 09:09:35 +0200 Subject: cpu/speculation: Warn on unsupported mitigations= parameter Currently, if the user specifies an unsupported mitigation strategy on the kernel command line, it will be ignored silently. The code will fall back to the default strategy, possibly leaving the system more vulnerable than expected. This may happen due to e.g. a simple typo, or, for a stable kernel release, because not all mitigation strategies have been backported. Inform the user by printing a message. Fixes: 98af8452945c5565 ("cpu/speculation: Add 'mitigations=' cmdline option") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Greg Kroah-Hartman Cc: Ben Hutchings Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190516070935.22546-1-geert@linux-m68k.org --- kernel/cpu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 077fde6fb953..551db494f153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2339,6 +2339,9 @@ static int __init mitigations_parse_cmdline(char *arg) cpu_mitigations = CPU_MITIGATIONS_AUTO; else if (!strcmp(arg, "auto,nosmt")) cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + else + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", + arg); return 0; } -- cgit v1.2.3 From 471a739a47aa7d582f0cdf9d392957d04632bae2 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 26 Jun 2019 00:20:23 +0200 Subject: PCI: PM: Avoid skipping bus-level PM on platforms without ACPI There are platforms that do not call pm_set_suspend_via_firmware(), so pm_suspend_via_firmware() returns 'false' on them, but the power states of PCI devices (PCIe ports in particular) are changed as a result of powering down core platform components during system-wide suspend. Thus the pm_suspend_via_firmware() checks in pci_pm_suspend_noirq() and pci_pm_resume_noirq() introduced by commit 3e26c5feed2a ("PCI: PM: Skip devices in D0 for suspend-to- idle") are not sufficient to determine that devices left in D0 during suspend will remain in D0 during resume and so the bus-level power management can be skipped for them. For this reason, introduce a new global suspend flag, PM_SUSPEND_FLAG_NO_PLATFORM, set it for suspend-to-idle only and replace the pm_suspend_via_firmware() checks mentioned above with checks against this flag. Fixes: 3e26c5feed2a ("PCI: PM: Skip devices in D0 for suspend-to-idle") Reported-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Rafael J. Wysocki Tested-by: Mika Westerberg Reviewed-by: Mika Westerberg --- kernel/power/suspend.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 9505101ed2bc..096211299c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -493,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state) pm_suspend_target_state = state; + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + error = platform_suspend_begin(state); if (error) goto Close; -- cgit v1.2.3 From 33d4a5a7a5b4d02915d765064b2319e90a11cbde Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Thu, 27 Jun 2019 11:47:32 +0900 Subject: cpu/hotplug: Fix out-of-bounds read when setting fail state Setting invalid value to /sys/devices/system/cpu/cpuX/hotplug/fail can control `struct cpuhp_step *sp` address, results in the following global-out-of-bounds read. Reproducer: # echo -2 > /sys/devices/system/cpu/cpu0/hotplug/fail KASAN report: BUG: KASAN: global-out-of-bounds in write_cpuhp_fail+0x2cd/0x2e0 Read of size 8 at addr ffffffff89734438 by task bash/1941 CPU: 0 PID: 1941 Comm: bash Not tainted 5.2.0-rc6+ #31 Call Trace: write_cpuhp_fail+0x2cd/0x2e0 dev_attr_store+0x58/0x80 sysfs_kf_write+0x13d/0x1a0 kernfs_fop_write+0x2bc/0x460 vfs_write+0x1e1/0x560 ksys_write+0x126/0x250 do_syscall_64+0xc1/0x390 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f05e4f4c970 The buggy address belongs to the variable: cpu_hotplug_lock+0x98/0xa0 Memory state around the buggy address: ffffffff89734300: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 00 ffffffff89734380: fa fa fa fa 00 00 00 00 00 00 00 00 00 00 00 00 >ffffffff89734400: 00 00 00 00 fa fa fa fa 00 00 00 00 fa fa fa fa ^ ffffffff89734480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffffff89734500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Add a sanity check for the value written from user space. 
Fixes: 1db49484f21ed ("smp/hotplug: Hotplug state fail injection") Signed-off-by: Eiichi Tsukata Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/20190627024732.31672-1-devel@etsukata.com --- kernel/cpu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 551db494f153..ef1c565edc5d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1964,6 +1964,9 @@ static ssize_t write_cpuhp_fail(struct device *dev, if (ret) return ret; + if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) + return -EINVAL; + /* * Cannot fail STARTING/DYING callbacks. */ -- cgit v1.2.3 From 6fd2fe494b17bf2dec37b610d23a43a72b16923a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Jun 2019 22:22:09 -0400 Subject: copy_process(): don't use ksys_close() on cleanups anon_inode_getfd() should be used *ONLY* in situations when we are guaranteed to be past the last failure point (including copying the descriptor number to userland, at that). And ksys_close() should not be used for cleanups at all. anon_inode_getfile() is there for all nontrivial cases like that. Just use that... Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Al Viro Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- kernel/fork.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 39a3adaa4ad1..399aca51ff75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1712,31 +1712,6 @@ const struct file_operations pidfd_fops = { #endif }; -/** - * pidfd_create() - Create a new pid file descriptor. - * - * @pid: struct pid that the pidfd will reference - * - * This creates a new pid file descriptor with the O_CLOEXEC flag set. - * - * Note, that this function can only be called after the fd table has - * been unshared to avoid leaking the pidfd to the new process. - * - * Return: On success, a cloexec pidfd is returned. - * On error, a negative errno number will be returned. - */ -static int pidfd_create(struct pid *pid) -{ - int fd; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); - - return fd; -} - static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -1774,6 +1749,7 @@ static __latent_entropy struct task_struct *copy_process( int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; + struct file *pidfile = NULL; /* * Don't allow sharing the root directory with processes in a different @@ -2046,11 +2022,20 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). 
*/ if (clone_flags & CLONE_PIDFD) { - retval = pidfd_create(pid); + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; + + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfile)) { + put_unused_fd(pidfd); + goto bad_fork_free_pid; + } + get_pid(pid); /* held by pidfile now */ + retval = put_user(pidfd, parent_tidptr); if (retval) goto bad_fork_put_pidfd; @@ -2168,6 +2153,9 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* past the last point of failure */ + if (pidfile) + fd_install(pidfd, pidfile); init_task_pid_links(p); if (likely(p->pid)) { @@ -2234,8 +2222,10 @@ bad_fork_cancel_cgroup: bad_fork_cgroup_threadgroup_change_end: cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) - ksys_close(pidfd); + if (clone_flags & CLONE_PIDFD) { + fput(pidfile); + put_unused_fd(pidfd); + } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); -- cgit v1.2.3 From d5b844a2cf507fc7642c9ae80a9d585db3065c28 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 27 Jun 2019 10:13:34 +0200 Subject: ftrace/x86: Remove possible deadlock between register_kprobe() and ftrace_run_update_code() The commit 9f255b632bf12c4dd7 ("module: Fix livepatch/ftrace module text permissions race") causes a possible deadlock between register_kprobe() and ftrace_run_update_code() when ftrace is using stop_machine(). The existing dependency chain (in reverse order) is: -> #1 (text_mutex){+.+.}: validate_chain.isra.21+0xb32/0xd70 __lock_acquire+0x4b8/0x928 lock_acquire+0x102/0x230 __mutex_lock+0x88/0x908 mutex_lock_nested+0x32/0x40 register_kprobe+0x254/0x658 init_kprobes+0x11a/0x168 do_one_initcall+0x70/0x318 kernel_init_freeable+0x456/0x508 kernel_init+0x22/0x150 ret_from_fork+0x30/0x34 kernel_thread_starter+0x0/0xc -> #0 (cpu_hotplug_lock.rw_sem){++++}: check_prev_add+0x90c/0xde0 validate_chain.isra.21+0xb32/0xd70 __lock_acquire+0x4b8/0x928 lock_acquire+0x102/0x230 cpus_read_lock+0x62/0xd0 stop_machine+0x2e/0x60 arch_ftrace_update_code+0x2e/0x40 ftrace_run_update_code+0x40/0xa0 ftrace_startup+0xb2/0x168 register_ftrace_function+0x64/0x88 klp_patch_object+0x1a2/0x290 klp_enable_patch+0x554/0x980 do_one_initcall+0x70/0x318 do_init_module+0x6e/0x250 load_module+0x1782/0x1990 __s390x_sys_finit_module+0xaa/0xf0 system_call+0xd8/0x2d0 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(text_mutex); lock(cpu_hotplug_lock.rw_sem); lock(text_mutex); lock(cpu_hotplug_lock.rw_sem); It is similar problem that has been solved by the commit 2d1e38f56622b9b ("kprobes: Cure hotplug lock ordering issues"). Many locks are involved. To be on the safe side, text_mutex must become a low level lock taken after cpu_hotplug_lock.rw_sem. This can't be achieved easily with the current ftrace design. For example, arm calls set_all_modules_text_rw() already in ftrace_arch_code_modify_prepare(), see arch/arm/kernel/ftrace.c. This functions is called: + outside stop_machine() from ftrace_run_update_code() + without stop_machine() from ftrace_module_enable() Fortunately, the problematic fix is needed only on x86_64. It is the only architecture that calls set_all_modules_text_rw() in ftrace path and supports livepatching at the same time. 
Therefore it is enough to move text_mutex handling from the generic kernel/trace/ftrace.c into arch/x86/kernel/ftrace.c: ftrace_arch_code_modify_prepare() ftrace_arch_code_modify_post_process() This patch basically reverts the ftrace part of the problematic commit 9f255b632bf12c4dd7 ("module: Fix livepatch/ftrace module text permissions race"). And provides x86_64 specific-fix. Some refactoring of the ftrace code will be needed when livepatching is implemented for arm or nds32. These architectures call set_all_modules_text_rw() and use stop_machine() at the same time. Link: http://lkml.kernel.org/r/20190627081334.12793-1-pmladek@suse.com Fixes: 9f255b632bf12c4dd7 ("module: Fix livepatch/ftrace module text permissions race") Acked-by: Thomas Gleixner Reported-by: Miroslav Benes Reviewed-by: Miroslav Benes Reviewed-by: Josh Poimboeuf Signed-off-by: Petr Mladek [ As reviewed by Miroslav Benes , removed return value of ftrace_run_update_code() as it is a void function. ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38277af44f5c..576c41644e77 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,7 +34,6 @@ #include #include #include -#include #include @@ -2611,12 +2610,10 @@ static void ftrace_run_update_code(int command) { int ret; - mutex_lock(&text_mutex); - ret = ftrace_arch_code_modify_prepare(); FTRACE_WARN_ON(ret); if (ret) - goto out_unlock; + return; /* * By default we use stop_machine() to modify the code. @@ -2628,9 +2625,6 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); - -out_unlock: - mutex_unlock(&text_mutex); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -5784,7 +5778,6 @@ void ftrace_module_enable(struct module *mod) struct ftrace_page *pg; mutex_lock(&ftrace_lock); - mutex_lock(&text_mutex); if (ftrace_disabled) goto out_unlock; @@ -5846,7 +5839,6 @@ void ftrace_module_enable(struct module *mod) ftrace_arch_code_modify_post_process(); out_unlock: - mutex_unlock(&text_mutex); mutex_unlock(&ftrace_lock); process_cached_mods(mod->name); -- cgit v1.2.3 From d122ed6288d9821b405b0f84a3937955b9df545f Mon Sep 17 00:00:00 2001 From: Takeshi Misawa Date: Fri, 28 Jun 2019 19:56:40 +0900 Subject: tracing: Fix memory leak in tracing_err_log_open() When tracing_err_log_open() calls seq_open(), allocated memory is not freed. kmemleak report: unreferenced object 0xffff92c0781d1100 (size 128): comm "tail", pid 15116, jiffies 4295163855 (age 22.704s) hex dump (first 32 bytes): 00 f0 08 e5 c0 92 ff ff 00 10 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000000d0687d5>] kmem_cache_alloc+0x11f/0x1e0 [<000000003e3039a8>] seq_open+0x2f/0x90 [<000000008dd36b7d>] tracing_err_log_open+0x67/0x140 [<000000005a431ae2>] do_dentry_open+0x1df/0x3a0 [<00000000a2910603>] vfs_open+0x2f/0x40 [<0000000038b0a383>] path_openat+0x2e8/0x1690 [<00000000fe025bda>] do_filp_open+0x9b/0x110 [<00000000483a5091>] do_sys_open+0x1ba/0x260 [<00000000c558b5fd>] __x64_sys_openat+0x20/0x30 [<000000006881ec07>] do_syscall_64+0x5a/0x130 [<00000000571c2e94>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by calling seq_release() in tracing_err_log_fops.release(). 
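The convention at work: whatever .open allocates through seq_open() must be torn down in .release, and any reference taken in .open must be dropped there as well. A minimal sketch of the pairing for a plain seq_file with no extra state (hypothetical example_* names, assuming no other references are held):

    static const struct file_operations example_seq_fops = {
            .open    = example_seq_open,    /* calls seq_open() */
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = seq_release,         /* frees what seq_open() made */
    };

tracing_err_log needs the dedicated release shown below instead, because its open path also takes a trace_array reference and the file is reachable write-only, where no seq_file exists.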
Link: http://lkml.kernel.org/r/20190628105640.GA1863@DESKTOP Fixes: 8a062902be725 ("tracing: Add tracing error log") Reviewed-by: Tom Zanussi Signed-off-by: Takeshi Misawa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 83e08b78dbee..4122ccde6ec2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7126,12 +7126,24 @@ static ssize_t tracing_err_log_write(struct file *file, return count; } +static int tracing_err_log_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, .llseek = seq_lseek, - .release = tracing_release_generic_tr, + .release = tracing_err_log_release, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) -- cgit v1.2.3 From 46cc0b44428d0f0e81f11ea98217fc0edfbeab07 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Tue, 25 Jun 2019 10:29:10 +0900 Subject: tracing/snapshot: Resize spare buffer if size changed Current snapshot implementation swaps two ring_buffers even though their sizes are different from each other, that can cause an inconsistency between the contents of buffer_size_kb file and the current buffer size. For example: # cat buffer_size_kb 7 (expanded: 1408) # echo 1 > events/enable # grep bytes per_cpu/cpu0/stats bytes: 1441020 # echo 1 > snapshot // current:1408, spare:1408 # echo 123 > buffer_size_kb // current:123, spare:1408 # echo 1 > snapshot // current:1408, spare:123 # grep bytes per_cpu/cpu0/stats bytes: 1443700 # cat buffer_size_kb 123 // != current:1408 And also, a similar per-cpu case hits the following WARNING: Reproducer: # echo 1 > per_cpu/cpu0/snapshot # echo 123 > buffer_size_kb # echo 1 > per_cpu/cpu0/snapshot WARNING: WARNING: CPU: 0 PID: 1946 at kernel/trace/trace.c:1607 update_max_tr_single.part.0+0x2b8/0x380 Modules linked in: CPU: 0 PID: 1946 Comm: bash Not tainted 5.2.0-rc6 #20 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 RIP: 0010:update_max_tr_single.part.0+0x2b8/0x380 Code: ff e8 dc da f9 ff 0f 0b e9 88 fe ff ff e8 d0 da f9 ff 44 89 ee bf f5 ff ff ff e8 33 dc f9 ff 41 83 fd f5 74 96 e8 b8 da f9 ff <0f> 0b eb 8d e8 af da f9 ff 0f 0b e9 bf fd ff ff e8 a3 da f9 ff 48 RSP: 0018:ffff888063e4fca0 EFLAGS: 00010093 RAX: ffff888066214380 RBX: ffffffff99850fe0 RCX: ffffffff964298a8 RDX: 0000000000000000 RSI: 00000000fffffff5 RDI: 0000000000000005 RBP: 1ffff1100c7c9f96 R08: ffff888066214380 R09: ffffed100c7c9f9b R10: ffffed100c7c9f9a R11: 0000000000000003 R12: 0000000000000000 R13: 00000000ffffffea R14: ffff888066214380 R15: ffffffff99851060 FS: 00007f9f8173c700(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000714dc0 CR3: 0000000066fa6000 CR4: 00000000000006f0 Call Trace: ? trace_array_printk_buf+0x140/0x140 ? __mutex_lock_slowpath+0x10/0x10 tracing_snapshot_write+0x4c8/0x7f0 ? trace_printk_init_buffers+0x60/0x60 ? selinux_file_permission+0x3b/0x540 ? tracer_preempt_off+0x38/0x506 ? trace_printk_init_buffers+0x60/0x60 __vfs_write+0x81/0x100 vfs_write+0x1e1/0x560 ksys_write+0x126/0x250 ? __ia32_sys_read+0xb0/0xb0 ? 
do_syscall_64+0x1f/0x390 do_syscall_64+0xc1/0x390 entry_SYSCALL_64_after_hwframe+0x49/0xbe This patch adds resize_buffer_duplicate_size() to check if there is a difference between current/spare buffer sizes and resize a spare buffer if necessary. Link: http://lkml.kernel.org/r/20190625012910.13109-1-devel@etsukata.com Cc: stable@vger.kernel.org Fixes: ad909e21bbe69 ("tracing: Add internal tracing_snapshot() functions") Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4122ccde6ec2..c3aabb576fe5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6719,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; } #endif - if (!tr->allocated_snapshot) { + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, iter->cpu_file); + else ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - break; - } + if (ret < 0) + break; local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) -- cgit v1.2.3 From 97abc889ee296faf95ca0e978340fb7b942a3e32 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 28 Jun 2019 12:06:50 -0700 Subject: signal: remove the wrong signal_pending() check in restore_user_sigmask() This is the minimal fix for stable, I'll send cleanups later. Commit 854a6ed56839 ("signal: Add restore_user_sigmask()") introduced the visible change which breaks user-space: a signal temporary unblocked by set_user_sigmask() can be delivered even if the caller returns success or timeout. Change restore_user_sigmask() to accept the additional "interrupted" argument which should be used instead of signal_pending() check, and update the callers. Eric said: : For clarity. I don't think this is required by posix, or fundamentally to : remove the races in select. It is what linux has always done and we have : applications who care so I agree this fix is needed. : : Further in any case where the semantic change that this patch rolls back : (aka where allowing a signal to be delivered and the select like call to : complete) would be advantage we can do as well if not better by using : signalfd. : : Michael is there any chance we can get this guarantee of the linux : implementation of pselect and friends clearly documented. The guarantee : that if the system call completes successfully we are guaranteed that no : signal that is unblocked by using sigmask will be delivered? Link: http://lkml.kernel.org/r/20190604134117.GA29963@redhat.com Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()") Signed-off-by: Oleg Nesterov Reported-by: Eric Wong Tested-by: Eric Wong Acked-by: "Eric W. 
Biederman" Acked-by: Arnd Bergmann Acked-by: Deepa Dinamani Cc: Michael Kerrisk Cc: Jens Axboe Cc: Davidlohr Bueso Cc: Jason Baron Cc: Thomas Gleixner Cc: Al Viro Cc: David Laight Cc: [5.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d622eac9d169..edf8915ddd54 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2912,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask); * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed in from userland for the syscalls. */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, + bool interrupted) { if (!usigmask) @@ -2922,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) * Restoring sigmask here can lead to delivering signals that the above * syscalls are intended to block because of the sigmask passed in. */ - if (signal_pending(current)) { + if (interrupted) { current->saved_sigmask = *sigsaved; set_restore_sigmask(); return; -- cgit v1.2.3 From 1bf4580e00a248a2c86269125390eb3648e1877c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 28 Jun 2019 12:07:14 -0700 Subject: fork,memcg: alloc_thread_stack_node needs to set tsk->stack Commit 5eed6f1dff87 ("fork,memcg: fix crash in free_thread_stack on memcg charge fail") corrected two instances, but there was a third instance of this bug. Without setting tsk->stack, if memcg_charge_kernel_stack fails, it'll execute free_thread_stack() on a dangling pointer. Enterprise kernels are compiled with VMAP_STACK=y so this isn't critical, but custom VMAP_STACK=n builds should have some performance advantage, with the drawback of risking to fail fork because compaction didn't succeed. So as long as VMAP_STACK=n is a supported option it's worth fixing it upstream. Link: http://lkml.kernel.org/r/20190619011450.28048-1-aarcange@redhat.com Fixes: 9b6f7e163cd0 ("mm: rework memcg kernel stack accounting") Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 399aca51ff75..61667909ce83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -248,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - return page ? page_address(page) : NULL; + if (likely(page)) { + tsk->stack = page_address(page); + return tsk->stack; + } + return NULL; #endif } -- cgit v1.2.3 From 28dd29c06d0dede4b32b2c559cff21955a830928 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Jul 2019 16:01:46 +0200 Subject: fork: return proper negative error code Make sure to return a proper negative error code from copy_process() when anon_inode_getfile() fails with CLONE_PIDFD. 
Otherwise _do_fork() will not detect an error and get_task_pid() will operator on a nonsensical pointer: R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c R13: 00007ffc15fbb0ff R14: 00007ff07e47e9c0 R15: 0000000000000000 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 7990 Comm: syz-executor290 Not tainted 5.2.0-rc6+ #9 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:get_task_pid+0xe1/0x210 kernel/pid.c:372 Code: 89 ff e8 62 27 5f 00 49 8b 07 44 89 f1 4c 8d bc c8 90 01 00 00 eb 0c e8 0d fe 25 00 49 81 c7 38 05 00 00 4c 89 f8 48 c1 e8 03 <80> 3c 18 00 74 08 4c 89 ff e8 31 27 5f 00 4d 8b 37 e8 f9 47 12 00 RSP: 0018:ffff88808a4a7d78 EFLAGS: 00010203 RAX: 00000000000000a7 RBX: dffffc0000000000 RCX: ffff888088180600 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff88808a4a7d90 R08: ffffffff814fb3a8 R09: ffffed1015d66bf8 R10: ffffed1015d66bf8 R11: 1ffff11015d66bf7 R12: 0000000000041ffc R13: 1ffff11011494fbc R14: 0000000000000000 R15: 000000000000053d FS: 00007ff07e47e700(0000) GS:ffff8880aeb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004b5100 CR3: 0000000094df2000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: _do_fork+0x1b9/0x5f0 kernel/fork.c:2360 __do_sys_clone kernel/fork.c:2454 [inline] __se_sys_clone kernel/fork.c:2448 [inline] __x64_sys_clone+0xc1/0xd0 kernel/fork.c:2448 do_syscall_64+0xfe/0x140 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: https://lore.kernel.org/lkml/000000000000e0dc0d058c9e7142@google.com Reported-and-tested-by: syzbot+002e636502bc4b64eb5c@syzkaller.appspotmail.com Fixes: 6fd2fe494b17 ("copy_process(): don't use ksys_close() on cleanups") Cc: Jann Horn Cc: Al Viro Signed-off-by: Christian Brauner --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 61667909ce83..fe83343da24b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2036,6 +2036,7 @@ static __latent_entropy struct task_struct *copy_process( O_RDWR | O_CLOEXEC); if (IS_ERR(pidfile)) { put_unused_fd(pidfd); + retval = PTR_ERR(pidfile); goto bad_fork_free_pid; } get_pid(pid); /* held by pidfile now */ -- cgit v1.2.3 From 6994eefb0053799d2e07cd140df6c2ea106c41ee Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Jul 2019 17:32:23 +0200 Subject: ptrace: Fix ->ptracer_cred handling for PTRACE_TRACEME Fix two issues: When called for PTRACE_TRACEME, ptrace_link() would obtain an RCU reference to the parent's objective credentials, then give that pointer to get_cred(). However, the object lifetime rules for things like struct cred do not permit unconditionally turning an RCU reference into a stable reference. PTRACE_TRACEME records the parent's credentials as if the parent was acting as the subject, but that's not the case. 
If a malicious unprivileged child uses PTRACE_TRACEME and the parent is privileged, and at a later point, the parent process becomes attacker-controlled (because it drops privileges and calls execve()), the attacker ends up with control over two processes with a privileged ptrace relationship, which can be abused to ptrace a suid binary and obtain root privileges. Fix both of these by always recording the credentials of the process that is requesting the creation of the ptrace relationship: current_cred() can't change under us, and current is the proper subject for access control. This change is theoretically userspace-visible, but I am not aware of any code that it will actually break. Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: Jann Horn Acked-by: Oleg Nesterov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8456b6e2205f..705887f63288 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -79,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, */ static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - rcu_read_lock(); - __ptrace_link(child, new_parent, __task_cred(new_parent)); - rcu_read_unlock(); + __ptrace_link(child, new_parent, current_cred()); } /** -- cgit v1.2.3
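Taken together, the change swaps both the locking model and the recorded subject; condensed from the diff above:

    /* before: parent's *objective* creds, grabbed under RCU - neither
     * safe to get_cred() unconditionally nor the right subject for
     * PTRACE_TRACEME */
    __ptrace_link(child, new_parent, __task_cred(new_parent));

    /* after: creds of whoever requested the link - the tracer for
     * PTRACE_ATTACH, the child itself for PTRACE_TRACEME */
    __ptrace_link(child, new_parent, current_cred());

Since current_cred() cannot change underneath the caller, no RCU read-side section is needed around the call anymore.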