From 3a38bb98d9abdc3856f26b5ed4332803065cd7cf Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 10 Apr 2018 09:37:32 -0700
Subject: bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog

syzbot reported a possible deadlock in perf_event_detach_bpf_prog.
The error details:
  ======================================================
  WARNING: possible circular locking dependency detected
  4.16.0-rc7+ #3 Not tainted
  ------------------------------------------------------
  syz-executor7/24531 is trying to acquire lock:
   (bpf_event_mutex){+.+.}, at: [<000000008a849b07>] perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854

  but task is already holding lock:
   (&mm->mmap_sem){++++}, at: [<0000000038768f87>] vm_mmap_pgoff+0x198/0x280 mm/util.c:353

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #1 (&mm->mmap_sem){++++}:
       __might_fault+0x13a/0x1d0 mm/memory.c:4571
       _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
       copy_to_user include/linux/uaccess.h:155 [inline]
       bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694
       perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891
       _perf_ioctl kernel/events/core.c:4750 [inline]
       perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770
       vfs_ioctl fs/ioctl.c:46 [inline]
       do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
       SYSC_ioctl fs/ioctl.c:701 [inline]
       SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
       do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
       entry_SYSCALL_64_after_hwframe+0x42/0xb7

  -> #0 (bpf_event_mutex){+.+.}:
       lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
       __mutex_lock_common kernel/locking/mutex.c:756 [inline]
       __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
       mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
       perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
       perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
       _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
       put_event+0x24/0x30 kernel/events/core.c:4204
       perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
       remove_vma+0xb4/0x1b0 mm/mmap.c:172
       remove_vma_list mm/mmap.c:2490 [inline]
       do_munmap+0x82a/0xdf0 mm/mmap.c:2731
       mmap_region+0x59e/0x15a0 mm/mmap.c:1646
       do_mmap+0x6c0/0xe00 mm/mmap.c:1483
       do_mmap_pgoff include/linux/mm.h:2223 [inline]
       vm_mmap_pgoff+0x1de/0x280 mm/util.c:355
       SYSC_mmap_pgoff mm/mmap.c:1533 [inline]
       SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491
       SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline]
       SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91
       do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
       entry_SYSCALL_64_after_hwframe+0x42/0xb7

  other info that might help us debug this:

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&mm->mmap_sem);
                                 lock(bpf_event_mutex);
                                 lock(&mm->mmap_sem);
    lock(bpf_event_mutex);

   *** DEADLOCK ***
  ======================================================

The bug is introduced by Commit f371b304f12e ("bpf/tracing: allow
user space to query prog array on the same tp") where copy_to_user,
which requires mm->mmap_sem, is called inside bpf_event_mutex lock.
At the same time, during perf_event file descriptor close,
mm->mmap_sem is held first and then subsequent
perf_event_detach_bpf_prog needs bpf_event_mutex lock.
Such a senario caused a deadlock.

As suggested by Daniel, moving copy_to_user out of the
bpf_event_mutex lock should fix the problem.

Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp")
Reported-by: syzbot+dc5ca0e4c9bfafaf2bae@syzkaller.appspotmail.com
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/core.c        | 45 +++++++++++++++++++++++++++++----------------
 kernel/trace/bpf_trace.c | 25 +++++++++++++++++++++----
 2 files changed, 50 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b393abdd..ba03ec39efb3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1572,13 +1572,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
 	return cnt;
 }
 
+static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+				     u32 *prog_ids,
+				     u32 request_cnt)
+{
+	int i = 0;
+
+	for (; *prog; prog++) {
+		if (*prog == &dummy_bpf_prog.prog)
+			continue;
+		prog_ids[i] = (*prog)->aux->id;
+		if (++i == request_cnt) {
+			prog++;
+			break;
+		}
+	}
+
+	return !!(*prog);
+}
+
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 				__u32 __user *prog_ids, u32 cnt)
 {
 	struct bpf_prog **prog;
 	unsigned long err = 0;
-	u32 i = 0, *ids;
 	bool nospc;
+	u32 *ids;
 
 	/* users of this function are doing:
 	 * cnt = bpf_prog_array_length();
@@ -1595,16 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 		return -ENOMEM;
 	rcu_read_lock();
 	prog = rcu_dereference(progs)->progs;
-	for (; *prog; prog++) {
-		if (*prog == &dummy_bpf_prog.prog)
-			continue;
-		ids[i] = (*prog)->aux->id;
-		if (++i == cnt) {
-			prog++;
-			break;
-		}
-	}
-	nospc = !!(*prog);
+	nospc = bpf_prog_array_copy_core(prog, ids, cnt);
 	rcu_read_unlock();
 	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
 	kfree(ids);
@@ -1683,22 +1693,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 }
 
 int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
-			     __u32 __user *prog_ids, u32 request_cnt,
-			     __u32 __user *prog_cnt)
+			     u32 *prog_ids, u32 request_cnt,
+			     u32 *prog_cnt)
 {
+	struct bpf_prog **prog;
 	u32 cnt = 0;
 
 	if (array)
 		cnt = bpf_prog_array_length(array);
 
-	if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
-		return -EFAULT;
+	*prog_cnt = cnt;
 
 	/* return early if user requested only program count or nothing to copy */
 	if (!request_cnt || !cnt)
 		return 0;
 
-	return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+	/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
+	prog = rcu_dereference_check(array, 1)->progs;
+	return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+								     : 0;
 }
 
 static void bpf_prog_free_deferred(struct work_struct *work)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d4e12c..56ba0f2a01db 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -977,6 +977,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	struct perf_event_query_bpf __user *uquery = info;
 	struct perf_event_query_bpf query = {};
+	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -985,16 +986,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 		return -EINVAL;
 	if (copy_from_user(&query, uquery, sizeof(query)))
 		return -EFAULT;
-	if (query.ids_len > BPF_TRACE_MAX_PROGS)
+
+	ids_len = query.ids_len;
+	if (ids_len > BPF_TRACE_MAX_PROGS)
 		return -E2BIG;
+	ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
+	if (!ids)
+		return -ENOMEM;
+	/*
+	 * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
+	 * is required when user only wants to check for uquery->prog_cnt.
+	 * There is no need to check for it since the case is handled
+	 * gracefully in bpf_prog_array_copy_info.
+	 */
 
 	mutex_lock(&bpf_event_mutex);
 	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
-				       uquery->ids,
-				       query.ids_len,
-				       &uquery->prog_cnt);
+				       ids,
+				       ids_len,
+				       &prog_cnt);
 	mutex_unlock(&bpf_event_mutex);
 
+	if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
+	    copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
+		ret = -EFAULT;
+
+	kfree(ids);
 	return ret;
 }
 
-- 
cgit v1.2.3


From edf5c17d866eada03b8750368a12dc3def77d608 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 5 Apr 2018 16:15:55 +0200
Subject: staging: irda: remove remaining remants of irda code removal

There were some documentation locations that irda was mentioned, as well
as an old MAINTAINERS entry and the networking sysctl entries.  Clean
these all out as this stuff really is finally gone.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/sysctl_binary.c | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8c0dab4fd65..07148b497451 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -704,24 +704,6 @@ static const struct bin_table bin_net_netfilter_table[] = {
 	{}
 };
 
-static const struct bin_table bin_net_irda_table[] = {
-	{ CTL_INT,	NET_IRDA_DISCOVERY,		"discovery" },
-	{ CTL_STR,	NET_IRDA_DEVNAME,		"devname" },
-	{ CTL_INT,	NET_IRDA_DEBUG,			"debug" },
-	{ CTL_INT,	NET_IRDA_FAST_POLL,		"fast_poll_increase" },
-	{ CTL_INT,	NET_IRDA_DISCOVERY_SLOTS,	"discovery_slots" },
-	{ CTL_INT,	NET_IRDA_DISCOVERY_TIMEOUT,	"discovery_timeout" },
-	{ CTL_INT,	NET_IRDA_SLOT_TIMEOUT,		"slot_timeout" },
-	{ CTL_INT,	NET_IRDA_MAX_BAUD_RATE,		"max_baud_rate" },
-	{ CTL_INT,	NET_IRDA_MIN_TX_TURN_TIME,	"min_tx_turn_time" },
-	{ CTL_INT,	NET_IRDA_MAX_TX_DATA_SIZE,	"max_tx_data_size" },
-	{ CTL_INT,	NET_IRDA_MAX_TX_WINDOW,		"max_tx_window" },
-	{ CTL_INT,	NET_IRDA_MAX_NOREPLY_TIME,	"max_noreply_time" },
-	{ CTL_INT,	NET_IRDA_WARN_NOREPLY_TIME,	"warn_noreply_time" },
-	{ CTL_INT,	NET_IRDA_LAP_KEEPALIVE_TIME,	"lap_keepalive_time" },
-	{}
-};
-
 static const struct bin_table bin_net_table[] = {
 	{ CTL_DIR,	NET_CORE,		"core",		bin_net_core_table },
 	/* NET_ETHER not used */
@@ -743,7 +725,7 @@ static const struct bin_table bin_net_table[] = {
 	{ CTL_DIR,	NET_LLC,		"llc",		bin_net_llc_table },
 	{ CTL_DIR,	NET_NETFILTER,		"netfilter",	bin_net_netfilter_table },
 	/* NET_DCCP "dccp" no longer used */
-	{ CTL_DIR,	NET_IRDA,		"irda",		bin_net_irda_table },
+	/* NET_IRDA "irda" no longer used */
 	{ CTL_INT,	2089,			"nf_conntrack_max" },
 	{}
 };
-- 
cgit v1.2.3


From 5c8dad48e4f53d6fd0a7e4f95d7c1c983374de88 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 13 Apr 2018 11:55:13 -0700
Subject: trace_kprobe: Remove warning message "Could not insert probe at..."

This warning message is not very helpful, as the return value should
already show information about the error. Also, this message will
spam dmesg if the user space does testing in a loop, like:

    for x in {0..5}
    do
        echo p:xx xx+$x >> /sys/kernel/debug/tracing/kprobe_events
    done

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Song Liu <songliubraving@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Link: http://lkml.kernel.org/r/20180413185513.3626052-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/trace_kprobe.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1cd3fb4d70f8..02aed76e0978 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -512,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
 	if (ret == 0)
 		tk->tp.flags |= TP_FLAG_REGISTERED;
 	else {
-		pr_warn("Could not insert probe at %s+%lu: %d\n",
-			trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
 		if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
 			pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
 			ret = 0;
-- 
cgit v1.2.3


From e91c2518a5d22a07642f35d85f39001ad379dae4 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Mon, 16 Apr 2018 13:36:46 +0200
Subject: livepatch: Initialize shadow variables safely by a custom callback

The existing API allows to pass a sample data to initialize the shadow
data. It works well when the data are position independent. But it fails
miserably when we need to set a pointer to the shadow structure itself.

Unfortunately, we might need to initialize the pointer surprisingly
often because of struct list_head. It is even worse because the list
might be hidden in other common structures, for example, struct mutex,
struct wait_queue_head.

For example, this was needed to fix races in ALSA sequencer. It required
to add mutex into struct snd_seq_client. See commit b3defb791b26ea06
("ALSA: seq: Make ioctls race-free") and commit d15d662e89fc667b9
("ALSA: seq: Fix racy pool initializations")

This patch makes the API more safe. A custom constructor function and data
are passed to klp_shadow_*alloc() functions instead of the sample data.

Note that ctor_data are no longer a template for shadow->data. It might
point to any data that might be necessary when the constructor is called.

Also note that the constructor is called under klp_shadow_lock. It is
an internal spin_lock that synchronizes alloc() vs. get() operations,
see klp_shadow_get_or_alloc(). On one hand, this adds a risk of ABBA
deadlocks. On the other hand, it allows to do some operations safely.
For example, we could add the new structure into an existing list.
This must be done only once when the structure is allocated.

Reported-by: Nicolai Stange <nstange@suse.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/shadow.c | 82 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index fdac27588d60..b10a0bbb7f84 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id)
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get);
 
-static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id,
+				       size_t size, gfp_t gfp_flags,
+				       klp_shadow_ctor_t ctor, void *ctor_data,
+				       bool warn_on_exist)
 {
 	struct klp_shadow *new_shadow;
 	void *shadow_data;
@@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 	if (shadow_data)
 		goto exists;
 
-	/* Allocate a new shadow variable for use inside the lock below */
+	/*
+	 * Allocate a new shadow variable.  Fill it with zeroes by default.
+	 * More complex setting can be done by @ctor function.  But it is
+	 * called only when the buffer is really used (under klp_shadow_lock).
+	 */
 	new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
 	if (!new_shadow)
 		return NULL;
 
-	new_shadow->obj = obj;
-	new_shadow->id = id;
-
-	/* Initialize the shadow variable if data provided */
-	if (data)
-		memcpy(new_shadow->data, data, size);
-
 	/* Look for <obj, id> again under the lock */
 	spin_lock_irqsave(&klp_shadow_lock, flags);
 	shadow_data = klp_shadow_get(obj, id);
@@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 		goto exists;
 	}
 
+	new_shadow->obj = obj;
+	new_shadow->id = id;
+
+	if (ctor) {
+		int err;
+
+		err = ctor(obj, new_shadow->data, ctor_data);
+		if (err) {
+			spin_unlock_irqrestore(&klp_shadow_lock, flags);
+			kfree(new_shadow);
+			pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n",
+			       obj, id, err);
+			return NULL;
+		}
+	}
+
 	/* No <obj, id> found, so attach the newly allocated one */
 	hash_add_rcu(klp_shadow_hash, &new_shadow->node,
 		     (unsigned long)new_shadow->obj);
@@ -170,26 +185,32 @@ exists:
  * klp_shadow_alloc() - allocate and add a new shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
- * @data:	pointer to data to attach to parent
  * @size:	size of attached data
  * @gfp_flags:	GFP mask for allocation
+ * @ctor:	custom constructor to initialize the shadow data (optional)
+ * @ctor_data:	pointer to any data needed by @ctor (optional)
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags.
+ * The data are zeroed by default.  They are further initialized by @ctor
+ * function if it is not NULL.  The new shadow variable is then added
+ * to the global hashtable.
  *
- * Allocates @size bytes for new shadow variable data using @gfp_flags
- * and copies @size bytes from @data into the new shadow variable's own
- * data space.  If @data is NULL, @size bytes are still allocated, but
- * no copy is performed.  The new shadow variable is then added to the
- * global hashtable.
+ * If an existing <obj, id> shadow variable can be found, this routine will
+ * issue a WARN, exit early and return NULL.
  *
- * If an existing <obj, id> shadow variable can be found, this routine
- * will issue a WARN, exit early and return NULL.
+ * This function guarantees that the constructor function is called only when
+ * the variable did not exist before.  The cost is that @ctor is called
+ * in atomic context under a spin lock.
  *
  * Return: the shadow variable data element, NULL on duplicate or
  * failure.
  */
-void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags)
+void *klp_shadow_alloc(void *obj, unsigned long id,
+		       size_t size, gfp_t gfp_flags,
+		       klp_shadow_ctor_t ctor, void *ctor_data)
 {
-	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+	return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+					 ctor, ctor_data, true);
 }
 EXPORT_SYMBOL_GPL(klp_shadow_alloc);
 
@@ -197,25 +218,28 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc);
  * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
- * @data:	pointer to data to attach to parent
  * @size:	size of attached data
  * @gfp_flags:	GFP mask for allocation
+ * @ctor:	custom constructor to initialize the shadow data (optional)
+ * @ctor_data:	pointer to any data needed by @ctor (optional)
  *
  * Returns a pointer to existing shadow data if an <obj, id> shadow
  * variable is already present.  Otherwise, it creates a new shadow
  * variable like klp_shadow_alloc().
  *
- * This function guarantees that only one shadow variable exists with
- * the given @id for the given @obj.  It also guarantees that the shadow
- * variable will be initialized by the given @data only when it did not
- * exist before.
+ * This function guarantees that only one shadow variable exists with the given
+ * @id for the given @obj.  It also guarantees that the constructor function
+ * will be called only when the variable did not exist before.  The cost is
+ * that @ctor is called in atomic context under a spin lock.
  *
  * Return: the shadow variable data element, NULL on failure.
  */
-void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-			       size_t size, gfp_t gfp_flags)
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
+			      size_t size, gfp_t gfp_flags,
+			      klp_shadow_ctor_t ctor, void *ctor_data)
 {
-	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+	return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+					 ctor, ctor_data, false);
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
 
-- 
cgit v1.2.3


From 3b2c77d000fe9f7d02e9e726e00dccf9f92b256f Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Mon, 16 Apr 2018 13:36:47 +0200
Subject: livepatch: Allow to call a custom callback when freeing shadow
 variables

We might need to do some actions before the shadow variable is freed.
For example, we might need to remove it from a list or free some data
that it points to.

This is already possible now. The user can get the shadow variable
by klp_shadow_get(), do the necessary actions, and then call
klp_shadow_free().

This patch allows to do it a more elegant way. The user could implement
the needed actions in a callback that is passed to klp_shadow_free()
as a parameter. The callback usually does reverse operations to
the constructor callback that can be called by klp_shadow_*alloc().

It is especially useful for klp_shadow_free_all(). There we need to do
these extra actions for each found shadow variable with the given ID.

Note that the memory used by the shadow variable itself is still released
later by rcu callback. It is needed to protect internal structures that
keep all shadow variables. But the destructor is called immediately.
The shadow variable must not be access anyway after klp_shadow_free()
is called. The user is responsible to protect this any suitable way.

Be aware that the destructor is called under klp_shadow_lock. It is
the same as for the contructor in klp_shadow_alloc().

Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/shadow.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index b10a0bbb7f84..83958c814439 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -243,15 +243,26 @@ void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
 
+static void klp_shadow_free_struct(struct klp_shadow *shadow,
+				   klp_shadow_dtor_t dtor)
+{
+	hash_del_rcu(&shadow->node);
+	if (dtor)
+		dtor(shadow->obj, shadow->data);
+	kfree_rcu(shadow, rcu_head);
+}
+
 /**
  * klp_shadow_free() - detach and free a <obj, id> shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
+ * @dtor:	custom callback that can be used to unregister the variable
+ *		and/or free data that the shadow variable points to (optional)
  *
  * This function releases the memory for this <obj, id> shadow variable
  * instance, callers should stop referencing it accordingly.
  */
-void klp_shadow_free(void *obj, unsigned long id)
+void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
 {
 	struct klp_shadow *shadow;
 	unsigned long flags;
@@ -263,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id)
 			       (unsigned long)obj) {
 
 		if (klp_shadow_match(shadow, obj, id)) {
-			hash_del_rcu(&shadow->node);
-			kfree_rcu(shadow, rcu_head);
+			klp_shadow_free_struct(shadow, dtor);
 			break;
 		}
 	}
@@ -276,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free);
 /**
  * klp_shadow_free_all() - detach and free all <*, id> shadow variables
  * @id:		data identifier
+ * @dtor:	custom callback that can be used to unregister the variable
+ *		and/or free data that the shadow variable points to (optional)
  *
  * This function releases the memory for all <*, id> shadow variable
  * instances, callers should stop referencing them accordingly.
  */
-void klp_shadow_free_all(unsigned long id)
+void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
 {
 	struct klp_shadow *shadow;
 	unsigned long flags;
@@ -290,10 +302,8 @@ void klp_shadow_free_all(unsigned long id)
 
 	/* Delete all <*, id> from hash */
 	hash_for_each(klp_shadow_hash, i, shadow, node) {
-		if (klp_shadow_match(shadow, shadow->obj, id)) {
-			hash_del_rcu(&shadow->node);
-			kfree_rcu(shadow, rcu_head);
-		}
+		if (klp_shadow_match(shadow, shadow->obj, id))
+			klp_shadow_free_struct(shadow, dtor);
 	}
 
 	spin_unlock_irqrestore(&klp_shadow_lock, flags);
-- 
cgit v1.2.3


From 101592b4904ecf6b8ed2a4784d41d180319d95a1 Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Mon, 9 Apr 2018 10:25:32 +0300
Subject: perf/core: Store context switch out type in
 PERF_RECORD_SWITCH[_CPU_WIDE]

Store preempting context switch out event into Perf trace as a part of
PERF_RECORD_SWITCH[_CPU_WIDE] record.

Percentage of preempting and non-preempting context switches help
understanding the nature of workloads (CPU or IO bound) that are running
on a machine;

The event is treated as preemption one when task->state value of the
thread being switched out is TASK_RUNNING. Event type encoding is
implemented using PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit;

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/r/9ff84e83-a0ca-dd82-a6d0-cb951689be74@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/core.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2d5fe26551f8..1bae80aaabfb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7587,6 +7587,10 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
+	if (!sched_in && task->state == TASK_RUNNING)
+		switch_event.event_id.header.misc |=
+				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+
 	perf_iterate_sb(perf_event_switch_output,
 		       &switch_event,
 		       NULL);
-- 
cgit v1.2.3


From 78b562fbfa2cf0a9fcb23c3154756b690f4905c1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 15 Apr 2018 11:23:50 +0200
Subject: perf: Return proper values for user stack errors

Return immediately when we find issue in the user stack checks. The
error value could get overwritten by following check for
PERF_SAMPLE_REGS_INTR.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: syzkaller-bugs@googlegroups.com
Cc: x86@kernel.org
Fixes: 60e2364e60e8 ("perf: Add ability to sample machine state on interrupt")
Link: http://lkml.kernel.org/r/20180415092352.12403-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1bae80aaabfb..67612ce359ad 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10209,9 +10209,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 		 * __u16 sample size limit.
 		 */
 		if (attr->sample_stack_user >= USHRT_MAX)
-			ret = -EINVAL;
+			return -EINVAL;
 		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
-			ret = -EINVAL;
+			return -EINVAL;
 	}
 
 	if (!attr->sample_max_stack)
-- 
cgit v1.2.3


From 5af44ca53d019de47efe6dbc4003dd518e5197ed Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 15 Apr 2018 11:23:51 +0200
Subject: perf: Fix sample_max_stack maximum check

The syzbot hit KASAN bug in perf_callchain_store having the entry stored
behind the allocated bounds [1].

We miss the sample_max_stack check for the initial event that allocates
callchain buffers. This missing check allows to create an event with
sample_max_stack value bigger than the global sysctl maximum:

  # sysctl -a | grep perf_event_max_stack
  kernel.perf_event_max_stack = 127

  # perf record -vv -C 1 -e cycles/max-stack=256/ kill
  ...
  perf_event_attr:
    size                             112
    ...
    sample_max_stack                 256
  ------------------------------------------------------------
  sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 4

Note the '-C 1', which forces perf record to create just single event.
Otherwise it opens event for every cpu, then the sample_max_stack check
fails on the second event and all's fine.

The fix is to run the sample_max_stack check also for the first event
with callchains.

[1] https://marc.info/?l=linux-kernel&m=152352732920874&w=2

Reported-by: syzbot+7c449856228b63ac951e@syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: syzkaller-bugs@googlegroups.com
Cc: x86@kernel.org
Fixes: 97c79a38cd45 ("perf core: Per event callchain limit")
Link: http://lkml.kernel.org/r/20180415092352.12403-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 772a43fea825..73cc26e321de 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -119,19 +119,22 @@ int get_callchain_buffers(int event_max_stack)
 		goto exit;
 	}
 
+	/*
+	 * If requesting per event more than the global cap,
+	 * return a different error to help userspace figure
+	 * this out.
+	 *
+	 * And also do it here so that we have &callchain_mutex held.
+	 */
+	if (event_max_stack > sysctl_perf_event_max_stack) {
+		err = -EOVERFLOW;
+		goto exit;
+	}
+
 	if (count > 1) {
 		/* If the allocation failed, give up */
 		if (!callchain_cpus_entries)
 			err = -ENOMEM;
-		/*
-		 * If requesting per event more than the global cap,
-		 * return a different error to help userspace figure
-		 * this out.
-		 *
-		 * And also do it here so that we have &callchain_mutex held.
-		 */
-		if (event_max_stack > sysctl_perf_event_max_stack)
-			err = -EOVERFLOW;
 		goto exit;
 	}
 
-- 
cgit v1.2.3


From bfb3d7b8b906b66551424d7636182126e1d134c8 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 15 Apr 2018 11:23:52 +0200
Subject: perf: Remove superfluous allocation error check

If the get_callchain_buffers fails to allocate the buffer it will
decrease the nr_callchain_events right away.

There's no point of checking the allocation error for
nr_callchain_events > 1. Removing that check.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: syzkaller-bugs@googlegroups.com
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/20180415092352.12403-3-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 73cc26e321de..c187aa3df3c8 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -131,14 +131,8 @@ int get_callchain_buffers(int event_max_stack)
 		goto exit;
 	}
 
-	if (count > 1) {
-		/* If the allocation failed, give up */
-		if (!callchain_cpus_entries)
-			err = -ENOMEM;
-		goto exit;
-	}
-
-	err = alloc_callchain_buffers();
+	if (count == 1)
+		err = alloc_callchain_buffers();
 exit:
 	if (err)
 		atomic_dec(&nr_callchain_events);
-- 
cgit v1.2.3


From 4450dc0ae2c18a0ac6dce560215c7a1fa12122b5 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 5 Apr 2018 17:26:58 +0200
Subject: clockevents: Fix kernel messages split across multiple lines

Convert the clockevents driver from old-style printk() to pr_info() and
pr_cont(), to fix split kernel messages like below:

    Clockevents: could not switch to one-shot mode:
     dummy_timer is not functional.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: https://lkml.kernel.org/r/1522942018-14471-1-git-send-email-geert%2Brenesas@glider.be
---
 kernel/time/tick-oneshot.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index c1f518e7aa80..6fe615d57ebb 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
 		    !tick_device_is_functional(dev)) {
 
-		printk(KERN_INFO "Clockevents: "
-		       "could not switch to one-shot mode:");
+		pr_info("Clockevents: could not switch to one-shot mode:");
 		if (!dev) {
-			printk(" no tick device\n");
+			pr_cont(" no tick device\n");
 		} else {
 			if (!tick_device_is_functional(dev))
-				printk(" %s is not functional.\n", dev->name);
+				pr_cont(" %s is not functional.\n", dev->name);
 			else
-				printk(" %s does not support one-shot mode.\n",
-				       dev->name);
+				pr_cont(" %s does not support one-shot mode.\n",
+					dev->name);
 		}
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From e142aa09ed88be98395dde7acb96fb2263566b68 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Fri, 13 Apr 2018 13:27:58 +0800
Subject: timekeeping: Remove __current_kernel_time()

The __current_kernel_time() function based on 'struct timespec' is no
longer recommended for new code, and the only user of this function has
been replaced by commit 6909e29fdefb ("kdb: use __ktime_get_real_seconds
instead of __current_kernel_time").

Remove the obsolete interface.

Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: sboyd@kernel.org
Cc: broonie@kernel.org
Cc: john.stultz@linaro.org
Link: https://lkml.kernel.org/r/1a9dbea7ee2cda7efe9ed330874075cf17fdbff6.1523596316.git.baolin.wang@linaro.org
---
 kernel/time/timekeeping.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ca90219a1e73..dcf7f20fcd12 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2139,13 +2139,6 @@ unsigned long get_seconds(void)
 }
 EXPORT_SYMBOL(get_seconds);
 
-struct timespec __current_kernel_time(void)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-
-	return timespec64_to_timespec(tk_xtime(tk));
-}
-
 struct timespec64 current_kernel_time64(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-- 
cgit v1.2.3


From be71eda5383faa663efdba9ef54a6b8255e3c7f0 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Wed, 18 Apr 2018 09:14:36 +0200
Subject: module: Fix display of wrong module .text address

Reading file /proc/modules shows the correct address:
[root@s35lp76 ~]# cat /proc/modules | egrep '^qeth_l2'
qeth_l2 94208 1 - Live 0x000003ff80401000

and reading file /sys/module/qeth_l2/sections/.text
[root@s35lp76 ~]# cat /sys/module/qeth_l2/sections/.text
0x0000000018ea8363
displays a random address.

This breaks the perf tool which uses this address on s390
to calculate start of .text section in memory.

Fix this by printing the correct (unhashed) address.

Thanks to Jessica Yu for helping on this.

Fixes: ef0010a30935 ("vsprintf: don't use 'restricted_pointer()' when not restricting")
Cc: <stable@vger.kernel.org> # v4.15+
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Jessica Yu <jeyu@kernel.org>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index a6e43a5806a1..ce8066b88178 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1472,7 +1472,8 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 {
 	struct module_sect_attr *sattr =
 		container_of(mattr, struct module_sect_attr, mattr);
-	return sprintf(buf, "0x%pK\n", (void *)sattr->address);
+	return sprintf(buf, "0x%px\n", kptr_restrict < 2 ?
+		       (void *)sattr->address : NULL);
 }
 
 static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-- 
cgit v1.2.3


From c3bca5d450b620dd3d36e14b5e1f43639fd47d6b Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 17 Apr 2018 14:57:42 -0700
Subject: posix-cpu-timers: Ensure set_process_cpu_timer is always evaluated

Commit a9445e47d897 ("posix-cpu-timers: Make set_process_cpu_timer()
more robust") moved the check into the 'if' statement. Unfortunately,
it did so on the right side of an && which means that it may get short
circuited and never evaluated. This is easily reproduced with:

$ cat loop.c
void main() {
  struct rlimit res;
  /* set the CPU time limit */
  getrlimit(RLIMIT_CPU,&res);
  res.rlim_cur = 2;
  res.rlim_max = 2;
  setrlimit(RLIMIT_CPU,&res);

  while (1);
}

Which will hang forever instead of being killed. Fix this by pulling the
evaluation out of the if statement but checking the return value instead.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1568337
Fixes: a9445e47d897 ("posix-cpu-timers: Make set_process_cpu_timer() more robust")
Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Cc: "Max R . P . Grossmann" <m@max.pm>
Cc: John Stultz <john.stultz@linaro.org>
Link: https://lkml.kernel.org/r/20180417215742.2521-1-labbott@redhat.com
---
 kernel/time/posix-cpu-timers.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2541bd89f20e..5a6251ac6f7a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   u64 *newval, u64 *oldval)
 {
 	u64 now;
+	int ret;
 
 	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
+	ret = cpu_timer_sample_group(clock_idx, tsk, &now);
 
-	if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
+	if (oldval && ret != -EINVAL) {
 		/*
 		 * We are setting itimer. The *oldval is absolute and we update
 		 * it to be relative, *newval argument is relative and we update
-- 
cgit v1.2.3


From 6ab690aa439803347743c0d899ac422774fdd5e7 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 20 Apr 2018 18:16:30 +0200
Subject: bpf: sockmap remove dead check

Remove dead code that bails on `attr->value_size > KMALLOC_MAX_SIZE` - the
previous check already bails on `attr->value_size != 4`.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 8dd9210d7db7..a3b21385e947 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1442,9 +1442,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
-	if (attr->value_size > KMALLOC_MAX_SIZE)
-		return ERR_PTR(-E2BIG);
-
 	err = bpf_tcp_ulp_register();
 	if (err && err != -EEXIST)
 		return ERR_PTR(err);
-- 
cgit v1.2.3


From e01e80634ecdde1dd113ac43b3adad21b47f3957 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 20 Apr 2018 14:55:31 -0700
Subject: fork: unconditionally clear stack on fork

One of the classes of kernel stack content leaks[1] is exposing the
contents of prior heap or stack contents when a new process stack is
allocated.  Normally, those stacks are not zeroed, and the old contents
remain in place.  In the face of stack content exposure flaws, those
contents can leak to userspace.

Fixing this will make the kernel no longer vulnerable to these flaws, as
the stack will be wiped each time a stack is assigned to a new process.
There's not a meaningful change in runtime performance; it almost looks
like it provides a benefit.

Performing back-to-back kernel builds before:
	Run times: 157.86 157.09 158.90 160.94 160.80
	Mean: 159.12
	Std Dev: 1.54

and after:
	Run times: 159.31 157.34 156.71 158.15 160.81
	Mean: 158.46
	Std Dev: 1.46

Instead of making this a build or runtime config, Andy Lutomirski
recommended this just be enabled by default.

[1] A noisy search for many kinds of stack content leaks can be seen here:
https://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=linux+kernel+stack+leak

I did some more with perf and cycle counts on running 100,000 execs of
/bin/true.

before:
Cycles: 218858861551 218853036130 214727610969 227656844122 224980542841
Mean:  221015379122.60
Std Dev: 4662486552.47

after:
Cycles: 213868945060 213119275204 211820169456 224426673259 225489986348
Mean:  217745009865.40
Std Dev: 5935559279.99

It continues to look like it's faster, though the deviation is rather
wide, but I'm not sure what I could do that would be less noisy.  I'm
open to ideas!

Link: http://lkml.kernel.org/r/20180221021659.GA37073@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 242c8c93d285..a5d21c42acfc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
-#ifdef CONFIG_DEBUG_KMEMLEAK
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
-#endif
+
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
-- 
cgit v1.2.3


From ba6b8de423f8d0dee48d6030288ed81c03ddf9f0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:23 -0700
Subject: bpf: sockmap, map_release does not hold refcnt for pinned maps

Relying on map_release hook to decrement the reference counts when a
map is removed only works if the map is not being pinned. In the
pinned case the ref is decremented immediately and the BPF programs
released. After this BPF programs may not be in-use which is not
what the user would expect.

This patch moves the release logic into bpf_map_put_uref() and brings
sockmap in-line with how a similar case is handled in prog array maps.

Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/arraymap.c | 3 ++-
 kernel/bpf/sockmap.c  | 4 ++--
 kernel/bpf/syscall.c  | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..027107f4be53 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -476,7 +476,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -495,6 +495,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+	.map_release_uref = bpf_fd_array_map_clear,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a3b21385e947..a73d484b6e4c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1831,7 +1831,7 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct bpf_prog *orig;
@@ -1855,7 +1855,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
-	.map_release = sock_map_release,
+	.map_release_uref = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..ebfe9f29dae8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -257,8 +257,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
 static void bpf_map_put_uref(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->usercnt)) {
-		if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
-			bpf_fd_array_map_clear(map);
+		if (map->ops->map_release_uref)
+			map->ops->map_release_uref(map);
 	}
 }
 
-- 
cgit v1.2.3


From e20f7334837ae47341d8ec4e3170d0b4336a3676 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:28 -0700
Subject: bpf: sockmap, sk_wait_event needed to handle blocking cases

In the recvmsg handler we need to add a wait event to support the
blocking use cases. Without this we return zero and may confuse
user applications. In the wait event any data received on the
sk either via sk_receive_queue or the psock ingress list will
wake up the sock.

Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a73d484b6e4c..aaf50ec77c94 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,6 +43,7 @@
 #include <net/tcp.h>
 #include <linux/ptr_ring.h>
 #include <net/inet_common.h>
+#include <linux/sched/signal.h>
 
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -732,6 +733,26 @@ out_err:
 	return err;
 }
 
+static int bpf_wait_data(struct sock *sk,
+			 struct smap_psock *psk, int flags,
+			 long timeo, int *err)
+{
+	int rc;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	rc = sk_wait_event(sk, &timeo,
+			   !list_empty(&psk->ingress) ||
+			   !skb_queue_empty(&sk->sk_receive_queue),
+			   &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	return rc;
+}
+
 static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -755,6 +776,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
 	lock_sock(sk);
+bytes_ready:
 	while (copied != len) {
 		struct scatterlist *sg;
 		struct sk_msg_buff *md;
@@ -809,6 +831,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 	}
 
+	if (!copied) {
+		long timeo;
+		int data;
+		int err = 0;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+		if (data) {
+			if (!skb_queue_empty(&sk->sk_receive_queue)) {
+				release_sock(sk);
+				smap_release_sock(psock, sk);
+				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+				return copied;
+			}
+			goto bytes_ready;
+		}
+
+		if (err)
+			copied = err;
+	}
+
 	release_sock(sk);
 	smap_release_sock(psock, sk);
 	return copied;
-- 
cgit v1.2.3


From 4fcfdfb83391c74e62683469289db42a143440ac Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:33 -0700
Subject: bpf: sockmap, fix double page_put on ENOMEM error in redirect path

In the case where the socket memory boundary is hit the redirect
path returns an ENOMEM error. However, before checking for this
condition the redirect scatterlist buffer is setup with a valid
page and length. This is never unwound so when the buffers are
released latter in the error path we do a put_page() and clear
the scatterlist fields. But, because the initial error happens
before completing the scatterlist buffer we end up with both the
original buffer and the redirect buffer pointing to the same page
resulting in duplicate put_page() calls.

To fix this simply move the initial configuration of the redirect
scatterlist buffer below the sock memory check.

Found this while running TCP_STREAM test with netperf using Cilium.

Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index aaf50ec77c94..634415c7fbcd 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -524,8 +524,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 	i = md->sg_start;
 
 	do {
-		r->sg_data[i] = md->sg_data[i];
-
 		size = (apply && apply_bytes < md->sg_data[i].length) ?
 			apply_bytes : md->sg_data[i].length;
 
@@ -536,6 +534,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 		}
 
 		sk_mem_charge(sk, size);
+		r->sg_data[i] = md->sg_data[i];
 		r->sg_data[i].length = size;
 		md->sg_data[i].length -= size;
 		md->sg_data[i].offset += size;
-- 
cgit v1.2.3


From ba16293dad626d3e3827cfe0a1a743c1d93e76b7 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Date: Fri, 20 Apr 2018 20:37:58 +0530
Subject: tracing: Fix kernel crash while using empty filter with perf

Kernel is crashing when user tries to record 'ftrace:function' event
with empty filter:

  # perf record -e ftrace:function --filter="" ls

  # dmesg
  BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
  Oops: 0000 [#1] SMP PTI
  ...
  RIP: 0010:ftrace_profile_set_filter+0x14b/0x2d0
  RSP: 0018:ffffa4a7c0da7d20 EFLAGS: 00010246
  RAX: ffffa4a7c0da7d64 RBX: 0000000000000000 RCX: 0000000000000006
  RDX: 0000000000000000 RSI: 0000000000000092 RDI: ffff8c48ffc968f0
  ...
  Call Trace:
   _perf_ioctl+0x54a/0x6b0
   ? rcu_all_qs+0x5/0x30
  ...

After patch:
  # perf record -e ftrace:function --filter="" ls
  failed to set filter "" on event ftrace:function with 22 (Invalid argument)

Also, if user tries to echo "" > filter, it used to throw an error.
This behavior got changed by commit 80765597bc58 ("tracing: Rewrite
filter logic to be simpler and faster"). This patch restores the
behavior as a side effect:

Before patch:
  # echo "" > filter
  #

After patch:
  # echo "" > filter
  bash: echo: write error: Invalid argument
  #

Link: http://lkml.kernel.org/r/20180420150758.19787-1-ravi.bangoria@linux.ibm.com

Fixes: 80765597bc58 ("tracing: Rewrite filter logic to be simpler and faster")
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9b4716bb8bb0..1f951b3df60c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1499,14 +1499,14 @@ static int process_preds(struct trace_event_call *call,
 		return ret;
 	}
 
-	if (!nr_preds) {
-		prog = NULL;
-	} else {
-		prog = predicate_parse(filter_string, nr_parens, nr_preds,
+	if (!nr_preds)
+		return -EINVAL;
+
+	prog = predicate_parse(filter_string, nr_parens, nr_preds,
 			       parse_pred, call, pe);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-	}
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
 	rcu_assign_pointer(filter->prog, prog);
 	return 0;
 }
-- 
cgit v1.2.3


From bcbd385b61bbdef3491d662203ac2e8186e5be59 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Thu, 19 Apr 2018 12:55:56 +0200
Subject: kprobes: Fix random address output of blacklist file

File /sys/kernel/debug/kprobes/blacklist displays random addresses:

[root@s8360046 linux]# cat /sys/kernel/debug/kprobes/blacklist
0x0000000047149a90-0x00000000bfcb099a	print_type_x8
....

This breaks 'perf probe' which uses the blacklist file to prohibit
probes on certain functions by checking the address range.

Fix this by printing the correct (unhashed) address.

The file mode is read all but this is not an issue as the file
hierarchy points out:
 # ls -ld /sys/ /sys/kernel/ /sys/kernel/debug/ /sys/kernel/debug/kprobes/
	/sys/kernel/debug/kprobes/blacklist
dr-xr-xr-x 12 root root 0 Apr 19 07:56 /sys/
drwxr-xr-x  8 root root 0 Apr 19 07:56 /sys/kernel/
drwx------ 16 root root 0 Apr 19 06:56 /sys/kernel/debug/
drwxr-xr-x  2 root root 0 Apr 19 06:56 /sys/kernel/debug/kprobes/
-r--r--r--  1 root root 0 Apr 19 06:56 /sys/kernel/debug/kprobes/blacklist

Everything in and below /sys/kernel/debug is rwx to root only,
no group or others have access.

Background:
Directory /sys/kernel/debug/kprobes is created by debugfs_create_dir()
which sets the mode bits to rwxr-xr-x. Maybe change that to use the
parent's directory mode bits instead?

Link: http://lkml.kernel.org/r/20180419105556.86664-1-tmricht@linux.ibm.com

Fixes: ad67b74d2469 ("printk: hash addresses printed with %p")
Cc: stable@vger.kernel.org
Cc: <stable@vger.kernel.org> # v4.15+
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S Miller <davem@davemloft.net>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: acme@kernel.org

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 102160ff5c66..ea619021d901 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2428,7 +2428,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
 	struct kprobe_blacklist_entry *ent =
 		list_entry(v, struct kprobe_blacklist_entry, list);
 
-	seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+	seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr,
 		   (void *)ent->end_addr, (void *)ent->start_addr);
 	return 0;
 }
-- 
cgit v1.2.3


From 9a0fd675304d410f3a9586e1b333e16f4658d56c Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 15 Mar 2018 14:06:39 +0800
Subject: tracing: Fix missing tab for hwlat_detector print format

It's been missing for a while but no one is touching that up.  Fix it.

Link: http://lkml.kernel.org/r/20180315060639.9578-1-peterx@redhat.com

CC: Ingo Molnar <mingo@kernel.org>
Cc:stable@vger.kernel.org
Fixes: 7b2c86250122d ("tracing: Add NMI tracing in hwlat detector")
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_entries.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e954ae3d82c0..e3a658bac10f 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -356,7 +356,7 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
 		__field(	unsigned int,		seqnum		)
 	),
 
-	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
+	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
 		 __entry->seqnum,
 		 __entry->tv_sec,
 		 __entry->tv_nsec,
-- 
cgit v1.2.3


From 1f71addd34f4c442bec7d7c749acc1beb58126f2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 24 Apr 2018 21:22:18 +0200
Subject: tick/sched: Do not mess with an enqueued hrtimer

Kaike reported that in tests rdma hrtimers occasionaly stopped working. He
did great debugging, which provided enough context to decode the problem.

CPU 3			     	      	     CPU 2

idle
start sched_timer expires = 712171000000
 queue->next = sched_timer
					    start rdmavt timer. expires = 712172915662
					    lock(baseof(CPU3))
tick_nohz_stop_tick()
tick = 716767000000			    timerqueue_add(tmr)

hrtimer_set_expires(sched_timer, tick);
  sched_timer->expires = 716767000000  <---- FAIL
					     if (tmr->expires < queue->next->expires)
hrtimer_start(sched_timer)		          queue->next = tmr;
lock(baseof(CPU3))
					     unlock(baseof(CPU3))
timerqueue_remove()
timerqueue_add()

ts->sched_timer is queued and queue->next is pointing to it, but then
ts->sched_timer.expires is modified.

This not only corrupts the ordering of the timerqueue RB tree, it also
makes CPU2 see the new expiry time of timerqueue->next->expires when
checking whether timerqueue->next needs to be updated. So CPU2 sees that
the rdma timer is earlier than timerqueue->next and sets the rdma timer as
new next.

Depending on whether it had also seen the new time at RB tree enqueue, it
might have queued the rdma timer at the wrong place and then after removing
the sched_timer the RB tree is completely hosed.

The problem was introduced with a commit which tried to solve inconsistency
between the hrtimer in the tick_sched data and the underlying hardware
clockevent. It split out hrtimer_set_expires() to store the new tick time
in both the NOHZ and the NOHZ + HIGHRES case, but missed the fact that in
the NOHZ + HIGHRES case the hrtimer might still be queued.

Use hrtimer_start(timer, tick...) for the NOHZ + HIGHRES case which sets
timer->expires after canceling the timer and move the hrtimer_set_expires()
invocation into the NOHZ only code path which is not affected as it merily
uses the hrtimer as next event storage so code pathes can be shared with
the NOHZ + HIGHRES case.

Fixes: d4af6d933ccf ("nohz: Fix spurious warning when hrtimer and clockevent get out of sync")
Reported-by: "Wan Kaike" <kaike.wan@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Cc: "Marciniszyn Mike" <mike.marciniszyn@intel.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: linux-rdma@vger.kernel.org
Cc: "Dalessandro Dennis" <dennis.dalessandro@intel.com>
Cc: "Fleck John" <john.fleck@intel.com>
Cc: stable@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "Weiny Ira" <ira.weiny@intel.com>
Cc: "linux-rdma@vger.kernel.org"
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804241637390.1679@nanos.tec.linutronix.de
Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804242119210.1597@nanos.tec.linutronix.de
---
 kernel/time/tick-sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 646645e981f9..d31bec177fa5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -804,12 +804,12 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 		return;
 	}
 
-	hrtimer_set_expires(&ts->sched_timer, tick);
-
-	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
-	else
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+	} else {
+		hrtimer_set_expires(&ts->sched_timer, tick);
 		tick_program_event(tick, 1);
+	}
 }
 
 static void tick_nohz_retain_tick(struct tick_sched *ts)
-- 
cgit v1.2.3


From a3ed0e4393d6885b4af7ce84b437dc696490a530 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 25 Apr 2018 15:33:38 +0200
Subject: Revert: Unify CLOCK_MONOTONIC and CLOCK_BOOTTIME

Revert commits

92af4dcb4e1c ("tracing: Unify the "boot" and "mono" tracing clocks")
127bfa5f4342 ("hrtimer: Unify MONOTONIC and BOOTTIME clock behavior")
7250a4047aa6 ("posix-timers: Unify MONOTONIC and BOOTTIME clock behavior")
d6c7270e913d ("timekeeping: Remove boot time specific code")
f2d6fdbfd238 ("Input: Evdev - unify MONOTONIC and BOOTTIME clock behavior")
d6ed449afdb3 ("timekeeping: Make the MONOTONIC clock behave like the BOOTTIME clock")
72199320d49d ("timekeeping: Add the new CLOCK_MONOTONIC_ACTIVE clock")

As stated in the pull request for the unification of CLOCK_MONOTONIC and
CLOCK_BOOTTIME, it was clear that we might have to revert the change.

As reported by several folks systemd and other applications rely on the
documented behaviour of CLOCK_MONOTONIC on Linux and break with the above
changes. After resume daemons time out and other timeout related issues are
observed. Rafael compiled this list:

* systemd kills daemons on resume, after >WatchdogSec seconds
  of suspending (Genki Sky).  [Verified that that's because systemd uses
  CLOCK_MONOTONIC and expects it to not include the suspend time.]

* systemd-journald misbehaves after resume:
  systemd-journald[7266]: File /var/log/journal/016627c3c4784cd4812d4b7e96a34226/system.journal
corrupted or uncleanly shut down, renaming and replacing.
  (Mike Galbraith).

* NetworkManager reports "networking disabled" and networking is broken
  after resume 50% of the time (Pavel).  [May be because of systemd.]

* MATE desktop dims the display and starts the screensaver right after
  system resume (Pavel).

* Full system hang during resume (me).  [May be due to systemd or NM or both.]

That happens on debian and open suse systems.

It's sad, that these problems were neither catched in -next nor by those
folks who expressed interest in this change.

Reported-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Reported-by: Genki Sky <sky@genki.is>,
Reported-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Easton <kevin@guarana.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Salyzyn <salyzyn@android.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/time/hrtimer.c       | 16 ++++++++--
 kernel/time/posix-stubs.c   |  2 --
 kernel/time/posix-timers.c  | 26 +++++++++------
 kernel/time/tick-common.c   | 15 ---------
 kernel/time/tick-internal.h |  6 ----
 kernel/time/tick-sched.c    |  9 ------
 kernel/time/timekeeping.c   | 78 +++++++++++++++++++++------------------------
 kernel/time/timekeeping.h   |  1 +
 kernel/trace/trace.c        |  2 +-
 9 files changed, 70 insertions(+), 85 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index eda1210ce50f..14e858753d76 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -90,6 +90,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.clockid = CLOCK_REALTIME,
 			.get_time = &ktime_get_real,
 		},
+		{
+			.index = HRTIMER_BASE_BOOTTIME,
+			.clockid = CLOCK_BOOTTIME,
+			.get_time = &ktime_get_boottime,
+		},
 		{
 			.index = HRTIMER_BASE_TAI,
 			.clockid = CLOCK_TAI,
@@ -105,6 +110,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.clockid = CLOCK_REALTIME,
 			.get_time = &ktime_get_real,
 		},
+		{
+			.index = HRTIMER_BASE_BOOTTIME_SOFT,
+			.clockid = CLOCK_BOOTTIME,
+			.get_time = &ktime_get_boottime,
+		},
 		{
 			.index = HRTIMER_BASE_TAI_SOFT,
 			.clockid = CLOCK_TAI,
@@ -119,7 +129,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
 
 	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
 	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
-	[CLOCK_BOOTTIME]	= HRTIMER_BASE_MONOTONIC,
+	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
 	[CLOCK_TAI]		= HRTIMER_BASE_TAI,
 };
 
@@ -571,12 +581,14 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 {
 	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
 
 	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
-						   offs_real, offs_tai);
+					    offs_real, offs_boot, offs_tai);
 
 	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
+	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
 	base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
 
 	return now;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index e0dbae98db9d..69a937c3cd81 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -83,8 +83,6 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
 	case CLOCK_BOOTTIME:
 		get_monotonic_boottime64(tp);
 		break;
-	case CLOCK_MONOTONIC_ACTIVE:
-		ktime_get_active_ts64(tp);
 	default:
 		return -EINVAL;
 	}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b6899b5060bd..10b7186d0638 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -252,16 +252,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
 	return 0;
 }
 
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
 {
-	timekeeping_clocktai64(tp);
+	get_monotonic_boottime64(tp);
 	return 0;
 }
 
-static int posix_get_monotonic_active(clockid_t which_clock,
-				      struct timespec64 *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
 {
-	ktime_get_active_ts64(tp);
+	timekeeping_clocktai64(tp);
 	return 0;
 }
 
@@ -1317,9 +1316,19 @@ static const struct k_clock clock_tai = {
 	.timer_arm		= common_hrtimer_arm,
 };
 
-static const struct k_clock clock_monotonic_active = {
+static const struct k_clock clock_boottime = {
 	.clock_getres		= posix_get_hrtimer_res,
-	.clock_get		= posix_get_monotonic_active,
+	.clock_get		= posix_get_boottime,
+	.nsleep			= common_nsleep,
+	.timer_create		= common_timer_create,
+	.timer_set		= common_timer_set,
+	.timer_get		= common_timer_get,
+	.timer_del		= common_timer_del,
+	.timer_rearm		= common_hrtimer_rearm,
+	.timer_forward		= common_hrtimer_forward,
+	.timer_remaining	= common_hrtimer_remaining,
+	.timer_try_to_cancel	= common_hrtimer_try_to_cancel,
+	.timer_arm		= common_hrtimer_arm,
 };
 
 static const struct k_clock * const posix_clocks[] = {
@@ -1330,11 +1339,10 @@ static const struct k_clock * const posix_clocks[] = {
 	[CLOCK_MONOTONIC_RAW]		= &clock_monotonic_raw,
 	[CLOCK_REALTIME_COARSE]		= &clock_realtime_coarse,
 	[CLOCK_MONOTONIC_COARSE]	= &clock_monotonic_coarse,
-	[CLOCK_BOOTTIME]		= &clock_monotonic,
+	[CLOCK_BOOTTIME]		= &clock_boottime,
 	[CLOCK_REALTIME_ALARM]		= &alarm_clock,
 	[CLOCK_BOOTTIME_ALARM]		= &alarm_clock,
 	[CLOCK_TAI]			= &clock_tai,
-	[CLOCK_MONOTONIC_ACTIVE]	= &clock_monotonic_active,
 };
 
 static const struct k_clock *clockid_to_kclock(const clockid_t id)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 099572ca4a8f..49edc1c4f3e6 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -419,19 +419,6 @@ void tick_suspend_local(void)
 	clockevents_shutdown(td->evtdev);
 }
 
-static void tick_forward_next_period(void)
-{
-	ktime_t delta, now = ktime_get();
-	u64 n;
-
-	delta = ktime_sub(now, tick_next_period);
-	n = ktime_divns(delta, tick_period);
-	tick_next_period += n * tick_period;
-	if (tick_next_period < now)
-		tick_next_period += tick_period;
-	tick_sched_forward_next_period();
-}
-
 /**
  * tick_resume_local - Resume the local tick device
  *
@@ -444,8 +431,6 @@ void tick_resume_local(void)
 	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 	bool broadcast = tick_resume_check_broadcast();
 
-	tick_forward_next_period();
-
 	clockevents_tick_resume(td->evtdev);
 	if (!broadcast) {
 		if (td->mode == TICKDEV_MODE_PERIODIC)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 21efab7485ca..e277284c2831 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,12 +141,6 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
 static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
 #endif /* !(BROADCAST && ONESHOT) */
 
-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
-extern void tick_sched_forward_next_period(void);
-#else
-static inline void tick_sched_forward_next_period(void) { }
-#endif
-
 /* NO_HZ_FULL internal */
 #ifdef CONFIG_NO_HZ_FULL
 extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d31bec177fa5..da9455a6b42b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -51,15 +51,6 @@ struct tick_sched *tick_get_tick_sched(int cpu)
  */
 static ktime_t last_jiffies_update;
 
-/*
- * Called after resume. Make sure that jiffies are not fast forwarded due to
- * clock monotonic being forwarded by the suspended time.
- */
-void tick_sched_forward_next_period(void)
-{
-	last_jiffies_update = tick_next_period;
-}
-
 /*
  * Must be called with interrupts disabled !
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dcf7f20fcd12..49cbceef5deb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -138,12 +138,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
 
 static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 {
-	/* Update both bases so mono and raw stay coupled. */
-	tk->tkr_mono.base += delta;
-	tk->tkr_raw.base += delta;
-
-	/* Accumulate time spent in suspend */
-	tk->time_suspended += delta;
+	tk->offs_boot = ktime_add(tk->offs_boot, delta);
 }
 
 /*
@@ -473,6 +468,36 @@ u64 ktime_get_raw_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
 
+/**
+ * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ *
+ * To keep it NMI safe since we're accessing from tracing, we're not using a
+ * separate timekeeper with updates to monotonic clock and boot offset
+ * protected with seqlocks. This has the following minor side effects:
+ *
+ * (1) Its possible that a timestamp be taken after the boot offset is updated
+ * but before the timekeeper is updated. If this happens, the new boot offset
+ * is added to the old timekeeping making the clock appear to update slightly
+ * earlier:
+ *    CPU 0                                        CPU 1
+ *    timekeeping_inject_sleeptime64()
+ *    __timekeeping_inject_sleeptime(tk, delta);
+ *                                                 timestamp();
+ *    timekeeping_update(tk, TK_CLEAR_NTP...);
+ *
+ * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
+ * partially updated.  Since the tk->offs_boot update is a rare event, this
+ * should be a rare occurrence which postprocessing should be able to handle.
+ */
+u64 notrace ktime_get_boot_fast_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+
+	return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+}
+EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+
+
 /*
  * See comment for __ktime_get_fast_ns() vs. timestamp ordering
  */
@@ -764,6 +789,7 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
 
 static ktime_t *offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
+	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
 	[TK_OFFS_TAI]	= &tk_core.timekeeper.offs_tai,
 };
 
@@ -860,39 +886,6 @@ void ktime_get_ts64(struct timespec64 *ts)
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts64);
 
-/**
- * ktime_get_active_ts64 - Get the active non-suspended monotonic clock
- * @ts:		pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime clock and
- * the wall_to_monotonic offset, subtracts the accumulated suspend time and
- * stores the result in normalized timespec64 format in the variable
- * pointed to by @ts.
- */
-void ktime_get_active_ts64(struct timespec64 *ts)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	struct timespec64 tomono, tsusp;
-	u64 nsec, nssusp;
-	unsigned int seq;
-
-	WARN_ON(timekeeping_suspended);
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-		ts->tv_sec = tk->xtime_sec;
-		nsec = timekeeping_get_ns(&tk->tkr_mono);
-		tomono = tk->wall_to_monotonic;
-		nssusp = tk->time_suspended;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	ts->tv_sec += tomono.tv_sec;
-	ts->tv_nsec = 0;
-	timespec64_add_ns(ts, nsec + tomono.tv_nsec);
-	tsusp = ns_to_timespec64(nssusp);
-	*ts = timespec64_sub(*ts, tsusp);
-}
-
 /**
  * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
  *
@@ -1593,6 +1586,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
 		return;
 	}
 	tk_xtime_add(tk, delta);
+	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
 	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
 	tk_debug_account_sleep_time(delta);
 }
@@ -2125,7 +2119,7 @@ out:
 void getboottime64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended);
+	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
 
 	*ts = ktime_to_timespec64(t);
 }
@@ -2188,6 +2182,7 @@ void do_timer(unsigned long ticks)
  * ktime_get_update_offsets_now - hrtimer helper
  * @cwsseq:	pointer to check and store the clock was set sequence number
  * @offs_real:	pointer to storage for monotonic -> realtime offset
+ * @offs_boot:	pointer to storage for monotonic -> boottime offset
  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
  * Returns current monotonic time and updates the offsets if the
@@ -2197,7 +2192,7 @@ void do_timer(unsigned long ticks)
  * Called from hrtimer_interrupt() or retrigger_next_event()
  */
 ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
-				     ktime_t *offs_tai)
+				     ktime_t *offs_boot, ktime_t *offs_tai)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
@@ -2214,6 +2209,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 		if (*cwsseq != tk->clock_was_set_seq) {
 			*cwsseq = tk->clock_was_set_seq;
 			*offs_real = tk->offs_real;
+			*offs_boot = tk->offs_boot;
 			*offs_tai = tk->offs_tai;
 		}
 
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 79b67f5e0343..7a9b4eb7a1d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -6,6 +6,7 @@
  */
 extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
 					    ktime_t *offs_real,
+					    ktime_t *offs_boot,
 					    ktime_t *offs_tai);
 
 extern int timekeeping_valid_for_hres(void);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dfbcf9ee1447..414d7210b2ec 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1165,7 +1165,7 @@ static struct {
 	{ trace_clock,			"perf",		1 },
 	{ ktime_get_mono_fast_ns,	"mono",		1 },
 	{ ktime_get_raw_fast_ns,	"mono_raw",	1 },
-	{ ktime_get_mono_fast_ns,	"boot",		1 },
+	{ ktime_get_boot_fast_ns,	"boot",		1 },
 	ARCH_TRACE_CLOCKS
 };
 
-- 
cgit v1.2.3


From 0c92c7a3c5d416f47b32c5f20a611dfeca5d5f2e Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 23 Apr 2018 10:21:34 -0700
Subject: tracing: Fix bad use of igrab in trace_uprobe.c

As Miklos reported and suggested:

  This pattern repeats two times in trace_uprobe.c and in
  kernel/events/core.c as well:

      ret = kern_path(filename, LOOKUP_FOLLOW, &path);
      if (ret)
          goto fail_address_parse;

      inode = igrab(d_inode(path.dentry));
      path_put(&path);

  And it's wrong.  You can only hold a reference to the inode if you
  have an active ref to the superblock as well (which is normally
  through path.mnt) or holding s_umount.

  This way unmounting the containing filesystem while the tracepoint is
  active will give you the "VFS: Busy inodes after unmount..." message
  and a crash when the inode is finally put.

  Solution: store path instead of inode.

This patch fixes two instances in trace_uprobe.c. struct path is added to
struct trace_uprobe to keep the inode and containing mount point
referenced.

Link: http://lkml.kernel.org/r/20180423172135.4050588-1-songliubraving@fb.com

Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes")
Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU")
Cc: stable@vger.kernel.org
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Howard McLauchlan <hmclauchlan@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Reported-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 34fd0e0ec51d..ac892878dbe6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -55,6 +55,7 @@ struct trace_uprobe {
 	struct list_head		list;
 	struct trace_uprobe_filter	filter;
 	struct uprobe_consumer		consumer;
+	struct path			path;
 	struct inode			*inode;
 	char				*filename;
 	unsigned long			offset;
@@ -289,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
 	for (i = 0; i < tu->tp.nr_args; i++)
 		traceprobe_free_probe_arg(&tu->tp.args[i]);
 
-	iput(tu->inode);
+	path_put(&tu->path);
 	kfree(tu->tp.call.class->system);
 	kfree(tu->tp.call.name);
 	kfree(tu->filename);
@@ -363,7 +364,6 @@ end:
 static int create_trace_uprobe(int argc, char **argv)
 {
 	struct trace_uprobe *tu;
-	struct inode *inode;
 	char *arg, *event, *group, *filename;
 	char buf[MAX_EVENT_NAME_LEN];
 	struct path path;
@@ -371,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv)
 	bool is_delete, is_return;
 	int i, ret;
 
-	inode = NULL;
 	ret = 0;
 	is_delete = false;
 	is_return = false;
@@ -437,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv)
 	}
 	/* Find the last occurrence, in case the path contains ':' too. */
 	arg = strrchr(argv[1], ':');
-	if (!arg) {
-		ret = -EINVAL;
-		goto fail_address_parse;
-	}
+	if (!arg)
+		return -EINVAL;
 
 	*arg++ = '\0';
 	filename = argv[1];
 	ret = kern_path(filename, LOOKUP_FOLLOW, &path);
 	if (ret)
-		goto fail_address_parse;
-
-	inode = igrab(d_real_inode(path.dentry));
-	path_put(&path);
+		return ret;
 
-	if (!inode || !S_ISREG(inode->i_mode)) {
+	if (!d_is_reg(path.dentry)) {
 		ret = -EINVAL;
 		goto fail_address_parse;
 	}
@@ -490,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv)
 		goto fail_address_parse;
 	}
 	tu->offset = offset;
-	tu->inode = inode;
+	tu->path = path;
 	tu->filename = kstrdup(filename, GFP_KERNEL);
 
 	if (!tu->filename) {
@@ -558,7 +552,7 @@ error:
 	return ret;
 
 fail_address_parse:
-	iput(inode);
+	path_put(&path);
 
 	pr_info("Failed to parse address or file.\n");
 
@@ -922,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
 		goto err_flags;
 
 	tu->consumer.filter = filter;
+	tu->inode = d_real_inode(tu->path.dentry);
 	ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
 	if (ret)
 		goto err_buffer;
@@ -967,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
 	WARN_ON(!uprobe_filter_is_empty(&tu->filter));
 
 	uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+	tu->inode = NULL;
 	tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
 
 	uprobe_buffer_disable();
@@ -1337,7 +1333,6 @@ struct trace_event_call *
 create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
 {
 	struct trace_uprobe *tu;
-	struct inode *inode;
 	struct path path;
 	int ret;
 
@@ -1345,11 +1340,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
 	if (ret)
 		return ERR_PTR(ret);
 
-	inode = igrab(d_inode(path.dentry));
-	path_put(&path);
-
-	if (!inode || !S_ISREG(inode->i_mode)) {
-		iput(inode);
+	if (!d_is_reg(path.dentry)) {
+		path_put(&path);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -1364,11 +1356,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
 	if (IS_ERR(tu)) {
 		pr_info("Failed to allocate trace_uprobe.(%d)\n",
 			(int)PTR_ERR(tu));
+		path_put(&path);
 		return ERR_CAST(tu);
 	}
 
 	tu->offset = offs;
-	tu->inode = inode;
+	tu->path = path;
 	tu->filename = kstrdup(name, GFP_KERNEL);
 	init_trace_event_call(tu, &tu->tp.call);
 
-- 
cgit v1.2.3


From 61f94203c9efcaf44a7435298697caf406476c79 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 23 Apr 2018 10:21:35 -0700
Subject: tracing: Remove igrab() iput() call from uprobes.c

Caller of uprobe_register is required to keep the inode and containing
mount point referenced.

There was misuse of igrab() in uprobes.c and trace_uprobe.c. This is
because igrab() will not prevent umount of the containing mount point.
To fix this, we added path to struct trace_uprobe, which keeps the inode
and containing mount reference.

For uprobes.c, it is not necessary to call igrab() in uprobe_register(),
as the caller is required to keep the inode reference. The igrab() is
removed and comments on this requirement is added to uprobe_register().

Link: http://lkml.kernel.org/r/CAELBmZB2XX=qEOLAdvGG4cPx4GEntcSnWQquJLUK1ongRj35cA@mail.gmail.com
Link: http://lkml.kernel.org/r/20180423172135.4050588-2-songliubraving@fb.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Howard McLauchlan <hmclauchlan@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/events/uprobes.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ce6848e46e94..1725b902983f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 	if (!uprobe)
 		return NULL;
 
-	uprobe->inode = igrab(inode);
+	uprobe->inode = inode;
 	uprobe->offset = offset;
 	init_rwsem(&uprobe->register_rwsem);
 	init_rwsem(&uprobe->consumer_rwsem);
@@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 	if (cur_uprobe) {
 		kfree(uprobe);
 		uprobe = cur_uprobe;
-		iput(inode);
 	}
 
 	return uprobe;
@@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe)
 	rb_erase(&uprobe->rb_node, &uprobes_tree);
 	spin_unlock(&uprobes_treelock);
 	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
-	iput(uprobe->inode);
 	put_uprobe(uprobe);
 }
 
@@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
  * tuple).  Creation refcount stops uprobe_unregister from freeing the
  * @uprobe even before the register operation is complete. Creation
  * refcount is released when the last @uc for the @uprobe
- * unregisters.
+ * unregisters. Caller of uprobe_register() is required to keep @inode
+ * (and the containing mount) referenced.
  *
  * Return errno if it cannot successully install probes
  * else return 0 (success)
-- 
cgit v1.2.3


From 608940dabe1bd2ce4c97524004ec86637cf80f2c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 26 Apr 2018 20:04:47 -0500
Subject: tracing: Restore proper field flag printing when displaying triggers

The flag-printing code used when displaying hist triggers somehow got
dropped during refactoring of the inter-event patchset.  This restores
it.

Below are a couple examples - in the first case, .usecs wasn't being
displayed properly for common_timestamps and the second illustrates
the same for other flags such as .execname.

Before:

  # echo 'hist:key=common_pid.execname:val=count:sort=count' > /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
  # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
  hist:keys=common_pid:vals=hitcount,count:sort=count:size=2048 [active]

  # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
  # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
  hist:keys=pid:vals=hitcount:ts0=common_timestamp:sort=hitcount:size=2048:clock=global if comm=="cyclictest" [active]

After:

  # echo 'hist:key=common_pid.execname:val=count:sort=count' > /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
  # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
  hist:keys=common_pid.execname:vals=hitcount,count:sort=count:size=2048 [active]

  # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
  # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
  hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global if comm=="cyclictest" [active]

Link: http://lkml.kernel.org/r/492bab42ff21806600af98a8ea901af10efbee0c.1524790601.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0d7b3ffbecc2..66c87be4ebb2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4913,6 +4913,16 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 		seq_printf(m, "%s", field_name);
 	} else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
 		seq_puts(m, "common_timestamp");
+
+	if (hist_field->flags) {
+		if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
+		    !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+			const char *flags = get_hist_field_flags(hist_field);
+
+			if (flags)
+				seq_printf(m, ".%s", flags);
+		}
+	}
 }
 
 static int event_hist_trigger_print(struct seq_file *m,
-- 
cgit v1.2.3


From 5ec432d7bf9dd3b4a2b84f8974e3adb71f45fb1d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 26 Apr 2018 20:04:48 -0500
Subject: tracing: Add field parsing hist error for hist triggers

If the user specifies a nonexistent field for a hist trigger, the
current code correctly flags that as an error, but doesn't tell the
user what happened.

Fix this by invoking hist_err() with an appropriate message when
nonexistent fields are specified.

Before:

  # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
  -su: echo: write error: Invalid argument
  # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist

After:

  # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
  -su: echo: write error: Invalid argument
  # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
  ERROR: Couldn't find field: pid
    Last command: keys=pid:ts0=common_timestamp.usecs

Link: http://lkml.kernel.org/r/fdc8746969d16906120f162b99dd71c741e0b62c.1524790601.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Reported-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 66c87be4ebb2..f231fa2a3dcd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2481,6 +2481,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
 	else {
 		field = trace_find_event_field(file->event_call, field_name);
 		if (!field || !field->size) {
+			hist_err("Couldn't find field: ", field_name);
 			field = ERR_PTR(-EINVAL);
 			goto out;
 		}
-- 
cgit v1.2.3


From dcf234577cd31fa16874e828b90659166ad6b80d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 26 Apr 2018 20:04:49 -0500
Subject: tracing: Add field modifier parsing hist error for hist triggers

If the user specifies an invalid field modifier for a hist trigger,
the current code correctly flags that as an error, but doesn't tell
the user what happened.

Fix this by invoking hist_err() with an appropriate message when
invalid modifiers are specified.

Before:

  # echo 'hist:keys=pid:ts0=common_timestamp.junkusecs' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
  -su: echo: write error: Invalid argument
  # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist

After:

  # echo 'hist:keys=pid:ts0=common_timestamp.junkusecs' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
  -su: echo: write error: Invalid argument
  # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
  ERROR: Invalid field modifier: junkusecs
    Last command: keys=pid:ts0=common_timestamp.junkusecs

Link: http://lkml.kernel.org/r/b043c59fa79acd06a5f14a1d44dee9e5a3cd1248.1524790601.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f231fa2a3dcd..b9061ed59bbd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2466,6 +2466,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
 		else if (strcmp(modifier, "usecs") == 0)
 			*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
 		else {
+			hist_err("Invalid field modifier: ", modifier);
 			field = ERR_PTR(-EINVAL);
 			goto out;
 		}
-- 
cgit v1.2.3


From d66a270be3310d7aa132fec0cea77d3d32a0ff75 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 15 Mar 2018 08:44:24 -0400
Subject: tracepoint: Do not warn on ENOMEM

Tracepoint should only warn when a kernel API user does not respect the
required preconditions (e.g. same tracepoint enabled twice, or called
to remove a tracepoint that does not exist).

Silence warning in out-of-memory conditions, given that the error is
returned to the caller.

This ensures that out-of-memory error-injection testing does not trigger
warnings in tracepoint.c, which were seen by syzbot.

Link: https://lkml.kernel.org/r/001a114465e241a8720567419a72@google.com
Link: https://lkml.kernel.org/r/001a1140e0de15fc910567464190@google.com
Link: http://lkml.kernel.org/r/20180315124424.32319-1-mathieu.desnoyers@efficios.com

CC: Peter Zijlstra <peterz@infradead.org>
CC: Jiri Olsa <jolsa@redhat.com>
CC: Arnaldo Carvalho de Melo <acme@kernel.org>
CC: Alexander Shishkin <alexander.shishkin@linux.intel.com>
CC: Namhyung Kim <namhyung@kernel.org>
CC: stable@vger.kernel.org
Fixes: de7b2973903c6 ("tracepoint: Use struct pointer instead of name hash for reg/unreg tracepoints")
Reported-by: syzbot+9c0d616860575a73166a@syzkaller.appspotmail.com
Reported-by: syzbot+4e9ae7fa46233396f64d@syzkaller.appspotmail.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/tracepoint.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 671b13457387..1e37da2e0c25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 			lockdep_is_held(&tracepoints_mutex));
 	old = func_add(&tp_funcs, func, prio);
 	if (IS_ERR(old)) {
-		WARN_ON_ONCE(1);
+		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
 	}
 
@@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 			lockdep_is_held(&tracepoints_mutex));
 	old = func_remove(&tp_funcs, func);
 	if (IS_ERR(old)) {
-		WARN_ON_ONCE(1);
+		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
 	}
 
-- 
cgit v1.2.3


From 2aae7bcfa4104b770e6f612356adb8d66c6144d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 Apr 2018 17:28:55 +0200
Subject: clocksource: Allow clocksource_mark_unstable() on unregistered
 clocksources

Because of how the code flips between tsc-early and tsc clocksources
it might need to mark one or both unstable. The current code in
mark_tsc_unstable() only worked because previously it registered the
tsc clocksource once and then never touched it.

Since it now unregisters the tsc-early clocksource, it needs to know
if a clocksource got unregistered and the current cs->mult test
doesn't work for that. Instead use list_empty(&cs->list) to test for
registration.

Furthermore, since clocksource_mark_unstable() needs to place the cs
on the wd_list, it links the cs->list and cs->wd_list serialization.
It must not see a clocsource registered (!empty cs->list) but already
past dequeue_watchdog(). So place {en,de}queue{,_watchdog}() under the
same lock.

Provided cs->list is initialized to empty, this then allows us to
unconditionally use clocksource_mark_unstable(), regardless of the
registration state.

Fixes: aa83c45762a2 ("x86/tsc: Introduce early tsc clocksource")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Diego Viola <diego.viola@gmail.com>
Cc: len.brown@intel.com
Cc: rjw@rjwysocki.net
Cc: diego.viola@gmail.com
Cc: rui.zhang@intel.com
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180502135312.GS12217@hirez.programming.kicks-ass.net
---
 kernel/time/clocksource.c | 50 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0e974cface0b..c3d2b94723dc 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -119,6 +119,16 @@ static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
 static atomic_t watchdog_reset_pending;
 
+static void inline clocksource_watchdog_lock(unsigned long *flags)
+{
+	spin_lock_irqsave(&watchdog_lock, *flags);
+}
+
+static void inline clocksource_watchdog_unlock(unsigned long *flags)
+{
+	spin_unlock_irqrestore(&watchdog_lock, *flags);
+}
+
 static int clocksource_watchdog_kthread(void *data);
 static void __clocksource_change_rating(struct clocksource *cs, int rating);
 
@@ -142,6 +152,9 @@ static void __clocksource_unstable(struct clocksource *cs)
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
 
+	if (list_empty(&cs->list))
+		return;
+
 	if (cs->mark_unstable)
 		cs->mark_unstable(cs);
 
@@ -164,7 +177,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 
 	spin_lock_irqsave(&watchdog_lock, flags);
 	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
-		if (list_empty(&cs->wd_list))
+		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
 			list_add(&cs->wd_list, &watchdog_list);
 		__clocksource_unstable(cs);
 	}
@@ -319,9 +332,6 @@ static void clocksource_resume_watchdog(void)
 
 static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&watchdog_lock, flags);
 	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
 		/* cs is a clocksource to be watched. */
 		list_add(&cs->wd_list, &watchdog_list);
@@ -331,7 +341,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 	}
-	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
 static void clocksource_select_watchdog(bool fallback)
@@ -373,9 +382,6 @@ static void clocksource_select_watchdog(bool fallback)
 
 static void clocksource_dequeue_watchdog(struct clocksource *cs)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&watchdog_lock, flags);
 	if (cs != watchdog) {
 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
 			/* cs is a watched clocksource. */
@@ -384,21 +390,19 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
 			clocksource_stop_watchdog();
 		}
 	}
-	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
 static int __clocksource_watchdog_kthread(void)
 {
 	struct clocksource *cs, *tmp;
 	unsigned long flags;
-	LIST_HEAD(unstable);
 	int select = 0;
 
 	spin_lock_irqsave(&watchdog_lock, flags);
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
 			list_del_init(&cs->wd_list);
-			list_add(&cs->wd_list, &unstable);
+			__clocksource_change_rating(cs, 0);
 			select = 1;
 		}
 		if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -410,11 +414,6 @@ static int __clocksource_watchdog_kthread(void)
 	clocksource_stop_watchdog();
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 
-	/* Needs to be done outside of watchdog lock */
-	list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
-		list_del_init(&cs->wd_list);
-		__clocksource_change_rating(cs, 0);
-	}
 	return select;
 }
 
@@ -447,6 +446,9 @@ static inline int __clocksource_watchdog_kthread(void) { return 0; }
 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
 void clocksource_mark_unstable(struct clocksource *cs) { }
 
+static void inline clocksource_watchdog_lock(unsigned long *flags) { }
+static void inline clocksource_watchdog_unlock(unsigned long *flags) { }
+
 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 
 /**
@@ -779,14 +781,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
  */
 int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
+	unsigned long flags;
 
 	/* Initialize mult/shift and max_idle_ns */
 	__clocksource_update_freq_scale(cs, scale, freq);
 
 	/* Add clocksource to the clocksource list */
 	mutex_lock(&clocksource_mutex);
+
+	clocksource_watchdog_lock(&flags);
 	clocksource_enqueue(cs);
 	clocksource_enqueue_watchdog(cs);
+	clocksource_watchdog_unlock(&flags);
+
 	clocksource_select();
 	clocksource_select_watchdog(false);
 	mutex_unlock(&clocksource_mutex);
@@ -808,8 +815,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
  */
 void clocksource_change_rating(struct clocksource *cs, int rating)
 {
+	unsigned long flags;
+
 	mutex_lock(&clocksource_mutex);
+	clocksource_watchdog_lock(&flags);
 	__clocksource_change_rating(cs, rating);
+	clocksource_watchdog_unlock(&flags);
+
 	clocksource_select();
 	clocksource_select_watchdog(false);
 	mutex_unlock(&clocksource_mutex);
@@ -821,6 +833,8 @@ EXPORT_SYMBOL(clocksource_change_rating);
  */
 static int clocksource_unbind(struct clocksource *cs)
 {
+	unsigned long flags;
+
 	if (clocksource_is_watchdog(cs)) {
 		/* Select and try to install a replacement watchdog. */
 		clocksource_select_watchdog(true);
@@ -834,8 +848,12 @@ static int clocksource_unbind(struct clocksource *cs)
 		if (curr_clocksource == cs)
 			return -EBUSY;
 	}
+
+	clocksource_watchdog_lock(&flags);
 	clocksource_dequeue_watchdog(cs);
 	list_del_init(&cs->list);
+	clocksource_watchdog_unlock(&flags);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5b9e886a4af97574ca3ce1147f35545da0e7afc7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 Apr 2018 12:00:11 +0200
Subject: clocksource: Initialize cs->wd_list

A number of places relies on list_empty(&cs->wd_list), however the
list_head does not get initialized. Do so upon registration, such that
thereafter it is possible to rely on list_empty() correctly reflecting
the list membership status.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Diego Viola <diego.viola@gmail.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: stable@vger.kernel.org
Cc: len.brown@intel.com
Cc: rjw@rjwysocki.net
Cc: rui.zhang@intel.com
Link: https://lkml.kernel.org/r/20180430100344.472662715@infradead.org
---
 kernel/time/clocksource.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c3d2b94723dc..935f39eb8aac 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -332,6 +332,8 @@ static void clocksource_resume_watchdog(void)
 
 static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
+	INIT_LIST_HEAD(&cs->wd_list);
+
 	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
 		/* cs is a clocksource to be watched. */
 		list_add(&cs->wd_list, &watchdog_list);
-- 
cgit v1.2.3


From cd2af07d823e5287cd6c91d54337348c2a873462 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 Apr 2018 12:00:13 +0200
Subject: clocksource: Consistent de-rate when marking unstable

When a registered clocksource gets marked unstable the watchdog_kthread
will de-rate and re-select the clocksource. Ensure it also de-rates
when getting called on an unregistered clocksource.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: len.brown@intel.com
Cc: rjw@rjwysocki.net
Cc: diego.viola@gmail.com
Cc: rui.zhang@intel.com
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180430100344.594904898@infradead.org
---
 kernel/time/clocksource.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 935f39eb8aac..605656df16c4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -152,12 +152,19 @@ static void __clocksource_unstable(struct clocksource *cs)
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
 
-	if (list_empty(&cs->list))
+	/*
+	 * If the clocksource is registered clocksource_watchdog_kthread() will
+	 * re-rate and re-select.
+	 */
+	if (list_empty(&cs->list)) {
+		cs->rating = 0;
 		return;
+	}
 
 	if (cs->mark_unstable)
 		cs->mark_unstable(cs);
 
+	/* kick clocksource_watchdog_kthread() */
 	if (finished_booting)
 		schedule_work(&watchdog_work);
 }
-- 
cgit v1.2.3


From 7dba33c6346c337aac3f7cd188137d4a6d3d1f3a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 Apr 2018 12:00:14 +0200
Subject: clocksource: Rework stale comment

AFAICS the hotplug code no longer uses this function.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: len.brown@intel.com
Cc: rjw@rjwysocki.net
Cc: diego.viola@gmail.com
Cc: rui.zhang@intel.com
Link: https://lkml.kernel.org/r/20180430100344.656525644@infradead.org
---
 kernel/time/clocksource.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 605656df16c4..84f37420fcf5 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -173,10 +173,8 @@ static void __clocksource_unstable(struct clocksource *cs)
  * clocksource_mark_unstable - mark clocksource unstable via watchdog
  * @cs:		clocksource to be marked unstable
  *
- * This function is called instead of clocksource_change_rating from
- * cpu hotplug code to avoid a deadlock between the clocksource mutex
- * and the cpu hotplug mutex. It defers the update of the clocksource
- * to the watchdog thread.
+ * This function is called by the x86 TSC code to mark clocksources as unstable;
+ * it defers demotion and re-selection to a kthread.
  */
 void clocksource_mark_unstable(struct clocksource *cs)
 {
-- 
cgit v1.2.3


From 3cc9a472d625f31f981063882b07e96229b9e71a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 2 May 2018 13:50:19 -0700
Subject: bpf: sockmap, fix scatterlist update on error path in send with apply

When the call to do_tcp_sendpage() fails to send the complete block
requested we either retry if only a partial send was completed or
abort if we receive a error less than or equal to zero. Before
returning though we must update the scatterlist length/offset to
account for any partial send completed.

Before this patch we did this at the end of the retry loop, but
this was buggy when used while applying a verdict to fewer bytes
than in the scatterlist. When the scatterlist length was being set
we forgot to account for the apply logic reducing the size variable.
So the result was we chopped off some bytes in the scatterlist without
doing proper cleanup on them. This results in a WARNING when the
sock is tore down because the bytes have previously been charged to
the socket but are never uncharged.

The simple fix is to simply do the accounting inside the retry loop
subtracting from the absolute scatterlist values rather than trying
to accumulate the totals and subtract at the end.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 634415c7fbcd..943929a05c92 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -326,6 +326,9 @@ retry:
 			if (ret > 0) {
 				if (apply)
 					apply_bytes -= ret;
+
+				sg->offset += ret;
+				sg->length -= ret;
 				size -= ret;
 				offset += ret;
 				if (uncharge)
@@ -333,8 +336,6 @@ retry:
 				goto retry;
 			}
 
-			sg->length = size;
-			sg->offset = offset;
 			return ret;
 		}
 
-- 
cgit v1.2.3


From fec51d40ea65dd8f51a3e27fc69b4e7dc4f17776 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 2 May 2018 13:50:24 -0700
Subject: bpf: sockmap, zero sg_size on error when buffer is released

When an error occurs during a redirect we have two cases that need
to be handled (i) we have a cork'ed buffer (ii) we have a normal
sendmsg buffer.

In the cork'ed buffer case we don't currently support recovering from
errors in a redirect action. So the buffer is released and the error
should _not_ be pushed back to the caller of sendmsg/sendpage. The
rationale here is the user will get an error that relates to old
data that may have been sent by some arbitrary thread on that sock.
Instead we simple consume the data and tell the user that the data
has been consumed. We may add proper error recovery in the future.
However, this patch fixes a bug where the bytes outstanding counter
sg_size was not zeroed. This could result in a case where if the user
has both a cork'ed action and apply action in progress we may
incorrectly call into the BPF program when the user expected an
old verdict to be applied via the apply action. I don't have a use
case where using apply and cork at the same time is valid but we
never explicitly reject it because it should work fine. This patch
ensures the sg_size is zeroed so we don't have this case.

In the normal sendmsg buffer case (no cork data) we also do not
zero sg_size. Again this can confuse the apply logic when the logic
calls into the BPF program when the BPF programmer expected the old
verdict to remain. So ensure we set sg_size to zero here as well. And
additionally to keep the psock state in-sync with the sk_msg_buff
release all the memory as well. Previously we did this before
returning to the user but this left a gap where psock and sk_msg_buff
states were out of sync which seems fragile. No additional overhead
is taken here except for a call to check the length and realize its
already been freed. This is in the error path as well so in my
opinion lets have robust code over optimized error paths.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 943929a05c92..052c313b12db 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -701,15 +701,22 @@ more_data:
 		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
 		lock_sock(sk);
 
+		if (unlikely(err < 0)) {
+			free_start_sg(sk, m);
+			psock->sg_size = 0;
+			if (!cork)
+				*copied -= send;
+		} else {
+			psock->sg_size -= send;
+		}
+
 		if (cork) {
 			free_start_sg(sk, m);
+			psock->sg_size = 0;
 			kfree(m);
 			m = NULL;
+			err = 0;
 		}
-		if (unlikely(err))
-			*copied -= err;
-		else
-			psock->sg_size -= send;
 		break;
 	case __SK_DROP:
 	default:
-- 
cgit v1.2.3


From abaeb096ca38cad02c8a68c49ddd7efc043c319a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 2 May 2018 13:50:29 -0700
Subject: bpf: sockmap, fix error handling in redirect failures

When a redirect failure happens we release the buffers in-flight
without calling a sk_mem_uncharge(), the uncharge is called before
dropping the sock lock for the redirecte, however we missed updating
the ring start index. When no apply actions are in progress this
is OK because we uncharge the entire buffer before the redirect.
But, when we have apply logic running its possible that only a
portion of the buffer is being redirected. In this case we only
do memory accounting for the buffer slice being redirected and
expect to be able to loop over the BPF program again and/or if
a sock is closed uncharge the memory at sock destruct time.

With an invalid start index however the program logic looks at
the start pointer index, checks the length, and when seeing the
length is zero (from the initial release and failure to update
the pointer) aborts without uncharging/releasing the remaining
memory.

The fix for this is simply to update the start index. To avoid
fixing this error in two locations we do a small refactor and
remove one case where it is open-coded. Then fix it in the
single function.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 052c313b12db..098eca568c2b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -393,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 	} while (i != md->sg_end);
 }
 
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+			  struct sk_msg_buff *md, bool charge)
 {
 	struct scatterlist *sg = md->sg_data;
 	int i = md->sg_start, free;
@@ -403,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (bytes < free) {
 			sg[i].length -= bytes;
 			sg[i].offset += bytes;
-			sk_mem_uncharge(sk, bytes);
+			if (charge)
+				sk_mem_uncharge(sk, bytes);
 			break;
 		}
 
-		sk_mem_uncharge(sk, sg[i].length);
+		if (charge)
+			sk_mem_uncharge(sk, sg[i].length);
 		put_page(sg_page(&sg[i]));
 		bytes -= sg[i].length;
 		sg[i].length = 0;
@@ -418,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
+	md->sg_start = i;
 }
 
 static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -576,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 				       struct sk_msg_buff *md,
 				       int flags)
 {
+	bool ingress = !!(md->flags & BPF_F_INGRESS);
 	struct smap_psock *psock;
 	struct scatterlist *sg;
-	int i, err, free = 0;
-	bool ingress = !!(md->flags & BPF_F_INGRESS);
+	int err = 0;
 
 	sg = md->sg_data;
 
@@ -607,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 out_rcu:
 	rcu_read_unlock();
 out:
-	i = md->sg_start;
-	while (sg[i].length) {
-		free += sg[i].length;
-		put_page(sg_page(&sg[i]));
-		sg[i].length = 0;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	return free;
+	free_bytes_sg(NULL, send, md, false);
+	return err;
 }
 
 static inline void bpf_md_init(struct smap_psock *psock)
@@ -720,7 +716,7 @@ more_data:
 		break;
 	case __SK_DROP:
 	default:
-		free_bytes_sg(sk, send, m);
+		free_bytes_sg(sk, send, m, true);
 		apply_bytes_dec(psock, send);
 		*copied -= send;
 		psock->sg_size -= send;
-- 
cgit v1.2.3


From 1ce0500d234f8ef880c399d55a886af646beec9a Mon Sep 17 00:00:00 2001
From: Chen LinX <linx.z.chen@intel.com>
Date: Wed, 3 Sep 2014 14:31:09 +0800
Subject: ftrace: Have set_graph_* files have normal file modes

The set_graph_function and set_graph_notrace file mode should be 0644
instead of 0444 as they are writeable. Note, the mode appears to be ignored
regardless, but they should at least look sane.

Link: http://lkml.kernel.org/r/1409725869-4501-1-git-send-email-linx.z.chen@intel.com

Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Chen LinX <linx.z.chen@intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 16bbf062018f..8d83bcf9ef69 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5514,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
 	ftrace_create_filter_files(&global_ops, d_tracer);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	trace_create_file("set_graph_function", 0444, d_tracer,
+	trace_create_file("set_graph_function", 0644, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	trace_create_file("set_graph_notrace", 0444, d_tracer,
+	trace_create_file("set_graph_notrace", 0644, d_tracer,
 				    NULL,
 				    &ftrace_graph_notrace_fops);
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-- 
cgit v1.2.3


From 0c5a9acc8b4e878e761f735e144d4a7e4477d4e6 Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Date: Thu, 8 Feb 2018 09:41:53 +0800
Subject: tracing: Fix the file mode of stack tracer

It looks weird that the stack_trace_filter file can be written by root
but shows that it does not have write permission by ll command.

Link: http://lkml.kernel.org/r/1518054113-28096-1-git-send-email-liuzhengyuan@kylinos.cn

Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3c7bfc4bf5e9..4237eba4ef20 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -472,7 +472,7 @@ static __init int stack_trace_init(void)
 			NULL, &stack_trace_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-	trace_create_file("stack_trace_filter", 0444, d_tracer,
+	trace_create_file("stack_trace_filter", 0644, d_tracer,
 			  &trace_ops, &stack_trace_filter_fops);
 #endif
 
-- 
cgit v1.2.3