From c269cba35b061181bc23c470809c00e8f71e535a Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Mon, 22 Feb 2016 15:02:07 +0100
Subject: memremap: add arch specific hook for MEMREMAP_WB mappings

Currently, the memremap code serves MEMREMAP_WB mappings directly from
the kernel direct mapping, unless the region is in high memory, in which
case it falls back to using ioremap_cache(). However, the semantics of
ioremap_cache() are not unambiguously defined, and on ARM, it will
actually result in a mapping type that differs from the attributes used
for the linear mapping, and for this reason, the ioremap_cache() call
fails if the region is part of the memory managed by the kernel.

So instead, implement an optional hook 'arch_memremap_wb' whose default
implementation calls ioremap_cache() as before, but which can be
overridden by the architecture to do what is appropriate for it.

Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 kernel/memremap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a6d382312e6f..017532193fb1 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
 }
 #endif
 
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+	return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
 static void *try_ram_remap(resource_size_t offset, size_t size)
 {
 	unsigned long pfn = PHYS_PFN(offset);
@@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
 	/* In the simple case just return the existing linear address */
 	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
 		return __va(offset);
-	return NULL; /* fallback to ioremap_cache */
+	return NULL; /* fallback to arch_memremap_wb */
 }
 
 /**
@@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 		if (is_ram == REGION_INTERSECTS)
 			addr = try_ram_remap(offset, size);
 		if (!addr)
-			addr = ioremap_cache(offset, size);
+			addr = arch_memremap_wb(offset, size);
 	}
 
 	/*
-- 
cgit v1.2.3


From 0bf676d1fd5144e66ac06939fa3c7c12086c3b18 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 31 Mar 2016 10:49:28 +0200
Subject: audit: cleanup prune_tree_thread

We can use kthread_run instead of kthread_create+wake_up_process for
creating the thread.

We do not need to set the task state to TASK_RUNNING after schedule(),
the process is in that state already.

And we do not need to set the state to TASK_INTERRUPTIBLE when not
doing schedule() as we set the state to TASK_RUNNING immediately
afterwards.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: <linux-audit@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5efe9b299a12..25772476fa4a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg)
 static int prune_tree_thread(void *unused)
 {
 	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (list_empty(&prune_list))
+		if (list_empty(&prune_list)) {
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
-		__set_current_state(TASK_RUNNING);
+		}
 
 		mutex_lock(&audit_cmd_mutex);
 		mutex_lock(&audit_filter_mutex);
@@ -693,16 +693,14 @@ static int audit_launch_prune(void)
 {
 	if (prune_thread)
 		return 0;
-	prune_thread = kthread_create(prune_tree_thread, NULL,
+	prune_thread = kthread_run(prune_tree_thread, NULL,
 				"audit_prune_tree");
 	if (IS_ERR(prune_thread)) {
 		pr_err("cannot start thread audit_prune_tree");
 		prune_thread = NULL;
 		return -ENOMEM;
-	} else {
-		wake_up_process(prune_thread);
-		return 0;
 	}
+	return 0;
 }
 
 /* called with audit_filter_mutex */
-- 
cgit v1.2.3


From 7ffb8e317bae03b8ee5bdcec93dc3723be945e9b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 4 Apr 2016 16:44:02 -0400
Subject: audit: we don't need to __set_current_state(TASK_RUNNING)

Remove the calls to __set_current_state() to mark the task as running
and do some related cleanup in wait_for_auditd() to limit the amount
of work we do when we aren't going to reschedule the current task.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 3a3e5deeda8d..f52fbefede09 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -430,7 +430,6 @@ restart:
 					attempts, audit_pid);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule();
-				__set_current_state(TASK_RUNNING);
 				goto restart;
 			}
 		}
@@ -1324,15 +1323,14 @@ static inline void audit_get_stamp(struct audit_context *ctx,
 static long wait_for_auditd(long sleep_time)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue_exclusive(&audit_backlog_wait, &wait);
 
 	if (audit_backlog_limit &&
-	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+		add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		sleep_time = schedule_timeout(sleep_time);
-
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&audit_backlog_wait, &wait);
+		remove_wait_queue(&audit_backlog_wait, &wait);
+	}
 
 	return sleep_time;
 }
-- 
cgit v1.2.3


From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: KEYS: Generalise system_verify_data() to provide access to internal
 content

Generalise system_verify_data() to provide access to internal content
through a callback.  This allows all the PKCS#7 stuff to be hidden inside
this function and removed from the PE file parser and the PKCS#7 test key.

If external content is not required, NULL should be passed as data to the
function.  If the callback is not required, that can be set to NULL.

The function is now called verify_pkcs7_signature() to contrast with
verify_pefile_signature() and the definitions of both have been moved into
linux/verification.h along with the key_being_used_for enum.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/module_signing.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 64b9dead4a07..593aace88a02 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 		return -EBADMSG;
 	}
 
-	return system_verify_data(mod, modlen, mod + modlen, sig_len,
-				  VERIFYING_MODULE_SIGNATURE);
+	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, NULL);
 }
-- 
cgit v1.2.3


From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: PKCS#7: Make trust determination dependent on contents of trust
 keyring

Make the determination of the trustworthiness of a key dependent on whether
a key that can verify it is present in the supplied ring of trusted keys
rather than whether or not the verifying key has KEY_FLAG_TRUSTED set.

verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust
chain cannot be verified.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/module_signing.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 593aace88a02..6a64e03b9f44 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 	}
 
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
-- 
cgit v1.2.3


From a511e1af8b12f44c6e55786c463c9f093c214fb6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:26 +0100
Subject: KEYS: Move the point of trust determination to __key_link()

Move the point at which a key is determined to be trustworthy to
__key_link() so that we use the contents of the keyring being linked in to
to determine whether the key being linked in is trusted or not.

What is 'trusted' then becomes a matter of what's in the keyring.

Currently, the test is done when the key is parsed, but given that at that
point we can only sensibly refer to the contents of the system trusted
keyring, we can only use that as the basis for working out the
trustworthiness of a new key.

With this change, a trusted keyring is a set of keys that once the
trusted-only flag is set cannot be added to except by verification through
one of the contained keys.

Further, adding a key into a trusted keyring, whilst it might grant
trustworthiness in the context of that keyring, does not automatically
grant trustworthiness in the context of a second keyring to which it could
be secondarily linked.

To accomplish this, the authentication data associated with the key source
must now be retained.  For an X.509 cert, this means the contents of the
AuthorityKeyIdentifier and the signature data.


If system keyrings are disabled then restrict_link_by_builtin_trusted()
resolves to restrict_link_reject().  The integrity digital signature code
still works correctly with this as it was previously using
KEY_FLAG_TRUSTED_ONLY, which doesn't permit anything to be added if there
is no system keyring against which trust can be determined.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/module_signing.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6a64e03b9f44..937c844bee4a 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -12,7 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <keys/system_keyring.h>
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 #include "module-internal.h"
 
-- 
cgit v1.2.3


From 9ebc57cfaad21aacbc363eecead579269a46b493 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 12 Apr 2016 20:39:55 -0400
Subject: tracing: Rename check_ignore_pid() to ignore_this_task()

The name "check_ignore_pid" is confusing in trying to figure out if the pid
should be ignored or not. Rename it to "ignore_this_task" which is pretty
straight forward, as a task (not a pid) is passed in, and should if true
should be ignored.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..598a18675a6b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -484,7 +484,7 @@ static int cmp_pid(const void *key, const void *elt)
 }
 
 static bool
-check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
 	pid_t search_pid;
 	pid_t *pid;
@@ -517,8 +517,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, prev) &&
-		       check_ignore_pid(pid_list, next));
+		       ignore_this_task(pid_list, prev) &&
+		       ignore_this_task(pid_list, next));
 }
 
 static void
@@ -531,7 +531,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, next));
+		       ignore_this_task(pid_list, next));
 }
 
 static void
@@ -547,7 +547,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, task));
+		       ignore_this_task(pid_list, task));
 }
 
 static void
@@ -564,7 +564,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, current));
+		       ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1561,7 +1561,7 @@ static void ignore_task_cpu(void *data)
 					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, current));
+		       ignore_this_task(pid_list, current));
 }
 
 static ssize_t
-- 
cgit v1.2.3


From f4d34a87e9c10f0ffd03d3548db6bfb200d06cdf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 13 Apr 2016 16:27:49 -0400
Subject: tracing: Use pid bitmap instead of a pid array for set_event_pid

In order to add the ability to let tasks that are filtered by the events
have their children also be traced on fork (and then not traced on exit),
convert the array into a pid bitmask. Most of the time the number of pids is
only 32768 pids or a 4k bitmask, which is the same size as the default list
currently is, and that list could grow if more pids are listed.

This also greatly simplifies the code.

Suggested-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |   5 +-
 kernel/trace/trace_events.c | 221 ++++++++++++++++++++------------------------
 2 files changed, 102 insertions(+), 124 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3fff4adfd431..68cbb8e10aea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -177,9 +177,8 @@ struct trace_options {
 };
 
 struct trace_pid_list {
-	unsigned int			nr_pids;
-	int				order;
-	pid_t				*pids;
+	int				pid_max;
+	unsigned long			*pids;
 };
 
 /*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 598a18675a6b..45f7cc72bf25 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,7 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/bsearch.h>
+#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
@@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
 	mutex_unlock(&event_mutex);
 }
 
-static int cmp_pid(const void *key, const void *elt)
-{
-	const pid_t *search_pid = key;
-	const pid_t *pid = elt;
-
-	if (*search_pid == *pid)
-		return 0;
-	if (*search_pid < *pid)
-		return -1;
-	return 1;
-}
+/* Shouldn't this be in a header? */
+extern int pid_max;
 
 static bool
 ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
-	pid_t search_pid;
-	pid_t *pid;
+	pid_t pid;
 
 	/*
 	 * Return false, because if filtered_pids does not exist,
@@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	search_pid = task->pid;
+	pid = task->pid;
 
-	pid = bsearch(&search_pid, filtered_pids->pids,
-		      filtered_pids->nr_pids, sizeof(pid_t),
-		      cmp_pid);
-	if (!pid)
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (task->pid >= filtered_pids->pid_max)
 		return true;
 
-	return false;
+	return !test_bit(task->pid, filtered_pids->pids);
 }
 
 static void
@@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
 	/* Wait till all users are no longer using pid filtering */
 	synchronize_sched();
 
-	free_pages((unsigned long)pid_list->pids, pid_list->order);
+	vfree(pid_list->pids);
 	kfree(pid_list);
 }
 
@@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+	unsigned long pid = (unsigned long)v;
+
+	(*pos)++;
+
+	/* pid already is +1 of the actual prevous bit */
+	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+	/* Return pid + 1 to allow zero to be represented */
+	if (pid < pid_list->pid_max)
+		return (void *)(pid + 1);
+
+	return NULL;
+}
+
 static void *p_start(struct seq_file *m, loff_t *pos)
 	__acquires(RCU)
 {
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
+	unsigned long pid;
+	loff_t l = 0;
 
 	/*
 	 * Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
-	if (!pid_list || *pos >= pid_list->nr_pids)
+	if (!pid_list)
+		return NULL;
+
+	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+	if (pid >= pid_list->pid_max)
 		return NULL;
 
-	return (void *)&pid_list->pids[*pos];
+	/* Return pid + 1 so that zero can be the exit value */
+	for (pid++; pid && l < *pos;
+	     pid = (unsigned long)p_next(m, (void *)pid, &l))
+		;
+	return (void *)pid;
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
-static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct trace_array *tr = m->private;
-	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-
-	(*pos)++;
-
-	if (*pos >= pid_list->nr_pids)
-		return NULL;
-
-	return (void *)&pid_list->pids[*pos];
-}
-
 static int p_show(struct seq_file *m, void *v)
 {
-	pid_t *pid = v;
+	unsigned long pid = (unsigned long)v - 1;
 
-	seq_printf(m, "%d\n", *pid);
+	seq_printf(m, "%lu\n", pid);
 	return 0;
 }
 
@@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 	return r;
 }
 
-static int max_pids(struct trace_pid_list *pid_list)
-{
-	return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
-}
-
 static void ignore_task_cpu(void *data)
 {
 	struct trace_array *tr = data;
@@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	struct seq_file *m = filp->private_data;
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *filtered_pids = NULL;
-	struct trace_pid_list *pid_list = NULL;
+	struct trace_pid_list *pid_list;
 	struct trace_event_file *file;
 	struct trace_parser parser;
 	unsigned long val;
@@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	ssize_t read = 0;
 	ssize_t ret = 0;
 	pid_t pid;
-	int i;
+	int nr_pids = 0;
 
 	if (!cnt)
 		return 0;
@@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 		return -ENOMEM;
 
 	mutex_lock(&event_mutex);
+	filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+					     lockdep_is_held(&event_mutex));
+
 	/*
-	 * Load as many pids into the array before doing a
-	 * swap from the tr->filtered_pids to the new list.
+	 * Always recreate a new array. The write is an all or nothing
+	 * operation. Always create a new array when adding new pids by
+	 * the user. If the operation fails, then the current list is
+	 * not modified.
 	 */
+	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+	if (!pid_list) {
+		read = -ENOMEM;
+		goto out;
+	}
+	pid_list->pid_max = READ_ONCE(pid_max);
+	/* Only truncating will shrink pid_max */
+	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+		pid_list->pid_max = filtered_pids->pid_max;
+	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+	if (!pid_list->pids) {
+		kfree(pid_list);
+		read = -ENOMEM;
+		goto out;
+	}
+	if (filtered_pids) {
+		/* copy the current bits to the new max */
+		pid = find_first_bit(filtered_pids->pids,
+				     filtered_pids->pid_max);
+		while (pid < filtered_pids->pid_max) {
+			set_bit(pid, pid_list->pids);
+			pid = find_next_bit(filtered_pids->pids,
+					    filtered_pids->pid_max,
+					    pid + 1);
+			nr_pids++;
+		}
+	}
+
 	while (cnt > 0) {
 
 		this_pos = 0;
@@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 		ret = -EINVAL;
 		if (kstrtoul(parser.buffer, 0, &val))
 			break;
-		if (val > INT_MAX)
+		if (val >= pid_list->pid_max)
 			break;
 
 		pid = (pid_t)val;
 
-		ret = -ENOMEM;
-		if (!pid_list) {
-			pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-			if (!pid_list)
-				break;
-
-			filtered_pids = rcu_dereference_protected(tr->filtered_pids,
-							lockdep_is_held(&event_mutex));
-			if (filtered_pids)
-				pid_list->order = filtered_pids->order;
-			else
-				pid_list->order = 0;
-
-			pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
-								  pid_list->order);
-			if (!pid_list->pids)
-				break;
-
-			if (filtered_pids) {
-				pid_list->nr_pids = filtered_pids->nr_pids;
-				memcpy(pid_list->pids, filtered_pids->pids,
-				       pid_list->nr_pids * sizeof(pid_t));
-			} else
-				pid_list->nr_pids = 0;
-		}
-
-		if (pid_list->nr_pids >= max_pids(pid_list)) {
-			pid_t *pid_page;
-
-			pid_page = (void *)__get_free_pages(GFP_KERNEL,
-							    pid_list->order + 1);
-			if (!pid_page)
-				break;
-			memcpy(pid_page, pid_list->pids,
-			       pid_list->nr_pids * sizeof(pid_t));
-			free_pages((unsigned long)pid_list->pids, pid_list->order);
-
-			pid_list->order++;
-			pid_list->pids = pid_page;
-		}
+		set_bit(pid, pid_list->pids);
+		nr_pids++;
 
-		pid_list->pids[pid_list->nr_pids++] = pid;
 		trace_parser_clear(&parser);
 		ret = 0;
 	}
 	trace_parser_put(&parser);
 
 	if (ret < 0) {
-		if (pid_list)
-			free_pages((unsigned long)pid_list->pids, pid_list->order);
+		vfree(pid_list->pids);
 		kfree(pid_list);
-		mutex_unlock(&event_mutex);
-		return ret;
-	}
-
-	if (!pid_list) {
-		mutex_unlock(&event_mutex);
-		return ret;
+		read = ret;
+		goto out;
 	}
 
-	sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
-
-	/* Remove duplicates */
-	for (i = 1; i < pid_list->nr_pids; i++) {
-		int start = i;
-
-		while (i < pid_list->nr_pids &&
-		       pid_list->pids[i - 1] == pid_list->pids[i])
-			i++;
-
-		if (start != i) {
-			if (i < pid_list->nr_pids) {
-				memmove(&pid_list->pids[start], &pid_list->pids[i],
-					(pid_list->nr_pids - i) * sizeof(pid_t));
-				pid_list->nr_pids -= i - start;
-				i = start;
-			} else
-				pid_list->nr_pids = start;
-		}
+	if (!nr_pids) {
+		/* Cleared the list of pids */
+		vfree(pid_list->pids);
+		kfree(pid_list);
+		read = ret;
+		if (!filtered_pids)
+			goto out;
+		pid_list = NULL;
 	}
-
 	rcu_assign_pointer(tr->filtered_pids, pid_list);
 
 	list_for_each_entry(file, &tr->events, list) {
@@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	if (filtered_pids) {
 		synchronize_sched();
 
-		free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+		vfree(filtered_pids->pids);
 		kfree(filtered_pids);
 	} else {
 		/*
@@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	 */
 	on_each_cpu(ignore_task_cpu, tr, 1);
 
+ out:
 	mutex_unlock(&event_mutex);
 
 	ret = read;
-	*ppos += read;
+	if (read > 0)
+		*ppos += read;
 
 	return ret;
 }
-- 
cgit v1.2.3


From c37775d57830a36382a9774bb84eca4ce3d019cc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 13 Apr 2016 16:59:18 -0400
Subject: tracing: Add infrastructure to allow set_event_pid to follow children

Add the infrastructure needed to have the PIDs in set_event_pid to
automatically add PIDs of the children of the tasks that have their PIDs in
set_event_pid. This will also remove PIDs from set_event_pid when a task
exits

This is implemented by adding hooks into the fork and exit tracepoints. On
fork, the PIDs are added to the list, and on exit, they are removed.

Add a new option called event_fork that when set, PIDs in set_event_pid will
automatically get their children PIDs added when they fork, as well as any
task that exits will have its PID removed from set_event_pid.

This works for instances as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  3 ++
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c | 84 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 79 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2f0b9f33e9b..0d12dbde8399 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3571,6 +3571,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_RECORD_CMD)
 		trace_event_enable_cmd_record(enabled);
 
+	if (mask == TRACE_ITER_EVENT_FORK)
+		trace_event_follow_fork(tr, enabled);
+
 	if (mask == TRACE_ITER_OVERWRITE) {
 		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 68cbb8e10aea..2525042760e6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -655,6 +655,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
 extern cycle_t ftrace_now(int cpu);
 
 extern void trace_find_cmdline(int pid, char comm[]);
+extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
@@ -966,6 +967,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(STOP_ON_FREE,		"disable_on_free"),	\
 		C(IRQ_INFO,		"irq-info"),		\
 		C(MARKERS,		"markers"),		\
+		C(EVENT_FORK,		"event-fork"),		\
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 45f7cc72bf25..add81dff7520 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -474,11 +474,23 @@ static void ftrace_clear_events(struct trace_array *tr)
 /* Shouldn't this be in a header? */
 extern int pid_max;
 
+/* Returns true if found in filter */
 static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 {
-	pid_t pid;
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (search_pid >= filtered_pids->pid_max)
+		return false;
+
+	return test_bit(search_pid, filtered_pids->pids);
+}
 
+static bool
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
 	/*
 	 * Return false, because if filtered_pids does not exist,
 	 * all pids are good to trace.
@@ -486,16 +498,68 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	pid = task->pid;
+	return !find_filtered_pid(filtered_pids, task->pid);
+}
 
-	/*
-	 * If pid_max changed after filtered_pids was created, we
-	 * by default ignore all pids greater than the previous pid_max.
-	 */
-	if (task->pid >= filtered_pids->pid_max)
-		return true;
+static void filter_add_remove_task(struct trace_pid_list *pid_list,
+				   struct task_struct *self,
+				   struct task_struct *task)
+{
+	if (!pid_list)
+		return;
+
+	/* For forks, we only add if the forking task is listed */
+	if (self) {
+		if (!find_filtered_pid(pid_list, self->pid))
+			return;
+	}
+
+	/* Sorry, but we don't support pid_max changing after setting */
+	if (task->pid >= pid_list->pid_max)
+		return;
+
+	/* "self" is set for forks, and NULL for exits */
+	if (self)
+		set_bit(task->pid, pid_list->pids);
+	else
+		clear_bit(task->pid, pid_list->pids);
+}
+
+static void
+event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	filter_add_remove_task(pid_list, NULL, task);
+}
 
-	return !test_bit(task->pid, filtered_pids->pids);
+static void
+event_filter_pid_sched_process_fork(void *data,
+				    struct task_struct *self,
+				    struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	filter_add_remove_task(pid_list, self, task);
+}
+
+void trace_event_follow_fork(struct trace_array *tr, bool enable)
+{
+	if (enable) {
+		register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
+						       tr, INT_MIN);
+		register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+						       tr, INT_MAX);
+	} else {
+		unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
+						    tr);
+		unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+						    tr);
+	}
 }
 
 static void
-- 
cgit v1.2.3


From 08d43a5fa063e03c860f2f391a30c388bcbc948e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 10 Dec 2015 12:50:50 -0600
Subject: tracing: Add lock-free tracing_map

Add tracing_map, a special-purpose lock-free map for tracing.

tracing_map is designed to aggregate or 'sum' one or more values
associated with a specific object of type tracing_map_elt, which
is associated by the map to a given key.

It provides various hooks allowing per-tracer customization and is
separated out into a separate file in order to allow it to be shared
between multiple tracers, but isn't meant to be generally used outside
of that context.

The tracing_map implementation was inspired by lock-free map
algorithms originated by Dr. Cliff Click:

 http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
 http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf

Link: http://lkml.kernel.org/r/b43d68d1add33582a396f553c8ef705a33a6a748.1449767187.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig       |   13 +
 kernel/trace/Makefile      |    1 +
 kernel/trace/tracing_map.c | 1058 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/tracing_map.h |  282 ++++++++++++
 4 files changed, 1354 insertions(+)
 create mode 100644 kernel/trace/tracing_map.c
 create mode 100644 kernel/trace/tracing_map.h

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e45db6b0d878..48b732943e0e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -528,6 +528,19 @@ config MMIOTRACE
 	  See Documentation/trace/mmiotrace.txt.
 	  If you are not helping to develop drivers, say N.
 
+config TRACING_MAP
+	bool
+	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+	default n
+	help
+	  tracing_map is a special-purpose lock-free map for tracing,
+	  separated out as a stand-alone facility in order to allow it
+	  to be shared between multiple tracers.  It isn't meant to be
+	  generally used outside of that context, and is normally
+	  selected by tracers that use it.
+
+	  If in doubt, say N.
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 9b1044e936a6..4255c4057aaa 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_seq.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
new file mode 100644
index 000000000000..dd6401dd145e
--- /dev/null
+++ b/kernel/trace/tracing_map.c
@@ -0,0 +1,1058 @@
+/*
+ * tracing_map - lock-free map for tracing
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ *
+ * tracing_map implementation inspired by lock-free map algorithms
+ * originated by Dr. Cliff Click:
+ *
+ * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
+ * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+/*
+ * NOTE: For a detailed description of the data structures used by
+ * these functions (such as tracing_map_elt) please see the overview
+ * of tracing_map data structures at the beginning of tracing_map.h.
+ */
+
+/**
+ * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ * @n: The value to add to the sum
+ *
+ * Add n to sum i associated with the specified tracing_map_elt
+ * instance.  The index i is the index returned by the call to
+ * tracing_map_add_sum_field() when the tracing map was set up.
+ */
+void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+	atomic64_add(n, &elt->fields[i].sum);
+}
+
+/**
+ * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ *
+ * Retrieve the value of the sum i associated with the specified
+ * tracing_map_elt instance.  The index i is the index returned by the
+ * call to tracing_map_add_sum_field() when the tracing map was set
+ * up.
+ *
+ * Return: The sum associated with field i for elt.
+ */
+u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
+{
+	return (u64)atomic64_read(&elt->fields[i].sum);
+}
+
+int tracing_map_cmp_string(void *val_a, void *val_b)
+{
+	char *a = val_a;
+	char *b = val_b;
+
+	return strcmp(a, b);
+}
+
+int tracing_map_cmp_none(void *val_a, void *val_b)
+{
+	return 0;
+}
+
+static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
+{
+	u64 a = atomic64_read((atomic64_t *)val_a);
+	u64 b = atomic64_read((atomic64_t *)val_b);
+
+	return (a > b) ? 1 : ((a < b) ? -1 : 0);
+}
+
+#define DEFINE_TRACING_MAP_CMP_FN(type)					\
+static int tracing_map_cmp_##type(void *val_a, void *val_b)		\
+{									\
+	type a = *(type *)val_a;					\
+	type b = *(type *)val_b;					\
+									\
+	return (a > b) ? 1 : ((a < b) ? -1 : 0);			\
+}
+
+DEFINE_TRACING_MAP_CMP_FN(s64);
+DEFINE_TRACING_MAP_CMP_FN(u64);
+DEFINE_TRACING_MAP_CMP_FN(s32);
+DEFINE_TRACING_MAP_CMP_FN(u32);
+DEFINE_TRACING_MAP_CMP_FN(s16);
+DEFINE_TRACING_MAP_CMP_FN(u16);
+DEFINE_TRACING_MAP_CMP_FN(s8);
+DEFINE_TRACING_MAP_CMP_FN(u8);
+
+tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+					 int field_is_signed)
+{
+	tracing_map_cmp_fn_t fn = tracing_map_cmp_none;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s64;
+		else
+			fn = tracing_map_cmp_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s32;
+		else
+			fn = tracing_map_cmp_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s16;
+		else
+			fn = tracing_map_cmp_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s8;
+		else
+			fn = tracing_map_cmp_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int tracing_map_add_field(struct tracing_map *map,
+				 tracing_map_cmp_fn_t cmp_fn)
+{
+	int ret = -EINVAL;
+
+	if (map->n_fields < TRACING_MAP_FIELDS_MAX) {
+		ret = map->n_fields;
+		map->fields[map->n_fields++].cmp_fn = cmp_fn;
+	}
+
+	return ret;
+}
+
+/**
+ * tracing_map_add_sum_field - Add a field describing a tracing_map sum
+ * @map: The tracing_map
+ *
+ * Add a sum field to the key and return the index identifying it in
+ * the map and associated tracing_map_elts.  This is the index used
+ * for instance to update a sum for a particular tracing_map_elt using
+ * tracing_map_update_sum() or reading it via tracing_map_read_sum().
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts.
+ */
+int tracing_map_add_sum_field(struct tracing_map *map)
+{
+	return tracing_map_add_field(map, tracing_map_cmp_atomic64);
+}
+
+/**
+ * tracing_map_add_key_field - Add a field describing a tracing_map key
+ * @map: The tracing_map
+ * @offset: The offset within the key
+ * @cmp_fn: The comparison function that will be used to sort on the key
+ *
+ * Let the map know there is a key and that if it's used as a sort key
+ * to use cmp_fn.
+ *
+ * A key can be a subset of a compound key; for that purpose, the
+ * offset param is used to describe where within the the compound key
+ * the key referenced by this key field resides.
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts.
+ */
+int tracing_map_add_key_field(struct tracing_map *map,
+			      unsigned int offset,
+			      tracing_map_cmp_fn_t cmp_fn)
+
+{
+	int idx = tracing_map_add_field(map, cmp_fn);
+
+	if (idx < 0)
+		return idx;
+
+	map->fields[idx].offset = offset;
+
+	map->key_idx[map->n_keys++] = idx;
+
+	return idx;
+}
+
+void tracing_map_array_clear(struct tracing_map_array *a)
+{
+	unsigned int i;
+
+	if (!a->pages)
+		return;
+
+	for (i = 0; i < a->n_pages; i++)
+		memset(a->pages[i], 0, PAGE_SIZE);
+}
+
+void tracing_map_array_free(struct tracing_map_array *a)
+{
+	unsigned int i;
+
+	if (!a)
+		return;
+
+	if (!a->pages) {
+		kfree(a);
+		return;
+	}
+
+	for (i = 0; i < a->n_pages; i++) {
+		if (!a->pages[i])
+			break;
+		free_page((unsigned long)a->pages[i]);
+	}
+}
+
+struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
+						  unsigned int entry_size)
+{
+	struct tracing_map_array *a;
+	unsigned int i;
+
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (!a)
+		return NULL;
+
+	a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
+	a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift);
+	a->n_pages = n_elts / a->entries_per_page;
+	if (!a->n_pages)
+		a->n_pages = 1;
+	a->entry_shift = fls(a->entries_per_page) - 1;
+	a->entry_mask = (1 << a->entry_shift) - 1;
+
+	a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL);
+	if (!a->pages)
+		goto free;
+
+	for (i = 0; i < a->n_pages; i++) {
+		a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!a->pages[i])
+			goto free;
+	}
+ out:
+	return a;
+ free:
+	tracing_map_array_free(a);
+	a = NULL;
+
+	goto out;
+}
+
+static void tracing_map_elt_clear(struct tracing_map_elt *elt)
+{
+	unsigned i;
+
+	for (i = 0; i < elt->map->n_fields; i++)
+		if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
+			atomic64_set(&elt->fields[i].sum, 0);
+
+	if (elt->map->ops && elt->map->ops->elt_clear)
+		elt->map->ops->elt_clear(elt);
+}
+
+static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
+{
+	unsigned int i;
+
+	tracing_map_elt_clear(elt);
+
+	for (i = 0; i < elt->map->n_fields; i++) {
+		elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn;
+
+		if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64)
+			elt->fields[i].offset = elt->map->fields[i].offset;
+	}
+}
+
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+	if (!elt)
+		return;
+
+	if (elt->map->ops && elt->map->ops->elt_free)
+		elt->map->ops->elt_free(elt);
+	kfree(elt->fields);
+	kfree(elt->key);
+	kfree(elt);
+}
+
+static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
+{
+	struct tracing_map_elt *elt;
+	int err = 0;
+
+	elt = kzalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return ERR_PTR(-ENOMEM);
+
+	elt->map = map;
+
+	elt->key = kzalloc(map->key_size, GFP_KERNEL);
+	if (!elt->key) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL);
+	if (!elt->fields) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	tracing_map_elt_init_fields(elt);
+
+	if (map->ops && map->ops->elt_alloc) {
+		err = map->ops->elt_alloc(elt);
+		if (err)
+			goto free;
+	}
+	return elt;
+ free:
+	tracing_map_elt_free(elt);
+
+	return ERR_PTR(err);
+}
+
+static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
+{
+	struct tracing_map_elt *elt = NULL;
+	int idx;
+
+	idx = atomic_inc_return(&map->next_elt);
+	if (idx < map->max_elts) {
+		elt = *(TRACING_MAP_ELT(map->elts, idx));
+		if (map->ops && map->ops->elt_init)
+			map->ops->elt_init(elt);
+	}
+
+	return elt;
+}
+
+static void tracing_map_free_elts(struct tracing_map *map)
+{
+	unsigned int i;
+
+	if (!map->elts)
+		return;
+
+	for (i = 0; i < map->max_elts; i++)
+		tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i)));
+
+	tracing_map_array_free(map->elts);
+}
+
+static int tracing_map_alloc_elts(struct tracing_map *map)
+{
+	unsigned int i;
+
+	map->elts = tracing_map_array_alloc(map->max_elts,
+					    sizeof(struct tracing_map_elt *));
+	if (!map->elts)
+		return -ENOMEM;
+
+	for (i = 0; i < map->max_elts; i++) {
+		*(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map);
+		if (!(*(TRACING_MAP_ELT(map->elts, i)))) {
+			tracing_map_free_elts(map);
+
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static inline bool keys_match(void *key, void *test_key, unsigned key_size)
+{
+	bool match = true;
+
+	if (memcmp(key, test_key, key_size))
+		match = false;
+
+	return match;
+}
+
+static inline struct tracing_map_elt *
+__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+{
+	u32 idx, key_hash, test_key;
+	struct tracing_map_entry *entry;
+
+	key_hash = jhash(key, map->key_size, 0);
+	if (key_hash == 0)
+		key_hash = 1;
+	idx = key_hash >> (32 - (map->map_bits + 1));
+
+	while (1) {
+		idx &= (map->map_size - 1);
+		entry = TRACING_MAP_ENTRY(map->map, idx);
+		test_key = entry->key;
+
+		if (test_key && test_key == key_hash && entry->val &&
+		    keys_match(key, entry->val->key, map->key_size)) {
+			atomic64_inc(&map->hits);
+			return entry->val;
+		}
+
+		if (!test_key) {
+			if (lookup_only)
+				break;
+
+			if (!cmpxchg(&entry->key, 0, key_hash)) {
+				struct tracing_map_elt *elt;
+
+				elt = get_free_elt(map);
+				if (!elt) {
+					atomic64_inc(&map->drops);
+					entry->key = 0;
+					break;
+				}
+
+				memcpy(elt->key, key, map->key_size);
+				entry->val = elt;
+				atomic64_inc(&map->hits);
+
+				return entry->val;
+			}
+		}
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/**
+ * tracing_map_insert - Insert key and/or retrieve val from a tracing_map
+ * @map: The tracing_map to insert into
+ * @key: The key to insert
+ *
+ * Inserts a key into a tracing_map and creates and returns a new
+ * tracing_map_elt for it, or if the key has already been inserted by
+ * a previous call, returns the tracing_map_elt already associated
+ * with it.  When the map was created, the number of elements to be
+ * allocated for the map was specified (internally maintained as
+ * 'max_elts' in struct tracing_map), and that number of
+ * tracing_map_elts was created by tracing_map_init().  This is the
+ * pre-allocated pool of tracing_map_elts that tracing_map_insert()
+ * will allocate from when adding new keys.  Once that pool is
+ * exhausted, tracing_map_insert() is useless and will return NULL to
+ * signal that state.  There are two user-visible tracing_map
+ * variables, 'hits' and 'drops', which are updated by this function.
+ * Every time an element is either successfully inserted or retrieved,
+ * the 'hits' value is incrememented.  Every time an element insertion
+ * fails, the 'drops' value is incremented.
+ *
+ * This is a lock-free tracing map insertion function implementing a
+ * modified form of Cliff Click's basic insertion algorithm.  It
+ * requires the table size be a power of two.  To prevent any
+ * possibility of an infinite loop we always make the internal table
+ * size double the size of the requested table size (max_elts * 2).
+ * Likewise, we never reuse a slot or resize or delete elements - when
+ * we've reached max_elts entries, we simply return NULL once we've
+ * run out of entries.  Readers can at any point in time traverse the
+ * tracing map and safely access the key/val pairs.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If this was a newly inserted key, the val will be a newly allocated
+ * and associated tracing_map_elt pointer val.  If the key wasn't
+ * found and the pool of tracing_map_elts has been exhausted, NULL is
+ * returned and no further insertions will succeed.
+ */
+struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key)
+{
+	return __tracing_map_insert(map, key, false);
+}
+
+/**
+ * tracing_map_lookup - Retrieve val from a tracing_map
+ * @map: The tracing_map to perform the lookup on
+ * @key: The key to look up
+ *
+ * Looks up key in tracing_map and if found returns the matching
+ * tracing_map_elt.  This is a lock-free lookup; see
+ * tracing_map_insert() for details on tracing_map and how it works.
+ * Every time an element is retrieved, the 'hits' value is
+ * incrememented.  There is one user-visible tracing_map variable,
+ * 'hits', which is updated by this function.  Every time an element
+ * is successfully retrieved, the 'hits' value is incrememented.  The
+ * 'drops' value is never updated by this function.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If the key wasn't found, NULL is returned.
+ */
+struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key)
+{
+	return __tracing_map_insert(map, key, true);
+}
+
+/**
+ * tracing_map_destroy - Destroy a tracing_map
+ * @map: The tracing_map to destroy
+ *
+ * Frees a tracing_map along with its associated array of
+ * tracing_map_elts.
+ *
+ * Callers should make sure there are no readers or writers actively
+ * reading or inserting into the map before calling this.
+ */
+void tracing_map_destroy(struct tracing_map *map)
+{
+	if (!map)
+		return;
+
+	tracing_map_free_elts(map);
+
+	tracing_map_array_free(map->map);
+	kfree(map);
+}
+
+/**
+ * tracing_map_clear - Clear a tracing_map
+ * @map: The tracing_map to clear
+ *
+ * Resets the tracing map to a cleared or initial state.  The
+ * tracing_map_elts are all cleared, and the array of struct
+ * tracing_map_entry is reset to an initialized state.
+ *
+ * Callers should make sure there are no writers actively inserting
+ * into the map before calling this.
+ */
+void tracing_map_clear(struct tracing_map *map)
+{
+	unsigned int i;
+
+	atomic_set(&map->next_elt, -1);
+	atomic64_set(&map->hits, 0);
+	atomic64_set(&map->drops, 0);
+
+	tracing_map_array_clear(map->map);
+
+	for (i = 0; i < map->max_elts; i++)
+		tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i)));
+}
+
+static void set_sort_key(struct tracing_map *map,
+			 struct tracing_map_sort_key *sort_key)
+{
+	map->sort_key = *sort_key;
+}
+
+/**
+ * tracing_map_create - Create a lock-free map and element pool
+ * @map_bits: The size of the map (2 ** map_bits)
+ * @key_size: The size of the key for the map in bytes
+ * @ops: Optional client-defined tracing_map_ops instance
+ * @private_data: Client data associated with the map
+ *
+ * Creates and sets up a map to contain 2 ** map_bits number of
+ * elements (internally maintained as 'max_elts' in struct
+ * tracing_map).  Before using, map fields should be added to the map
+ * with tracing_map_add_sum_field() and tracing_map_add_key_field().
+ * tracing_map_init() should then be called to allocate the array of
+ * tracing_map_elts, in order to avoid allocating anything in the map
+ * insertion path.  The user-specified map size reflects the maximum
+ * number of elements that can be contained in the table requested by
+ * the user - internally we double that in order to keep the table
+ * sparse and keep collisions manageable.
+ *
+ * A tracing_map is a special-purpose map designed to aggregate or
+ * 'sum' one or more values associated with a specific object of type
+ * tracing_map_elt, which is attached by the map to a given key.
+ *
+ * tracing_map_create() sets up the map itself, and provides
+ * operations for inserting tracing_map_elts, but doesn't allocate the
+ * tracing_map_elts themselves, or provide a means for describing the
+ * keys or sums associated with the tracing_map_elts.  All
+ * tracing_map_elts for a given map have the same set of sums and
+ * keys, which are defined by the client using the functions
+ * tracing_map_add_key_field() and tracing_map_add_sum_field().  Once
+ * the fields are defined, the pool of elements allocated for the map
+ * can be created, which occurs when the client code calls
+ * tracing_map_init().
+ *
+ * When tracing_map_init() returns, tracing_map_elt elements can be
+ * inserted into the map using tracing_map_insert().  When called,
+ * tracing_map_insert() grabs a free tracing_map_elt from the pool, or
+ * finds an existing match in the map and in either case returns it.
+ * The client can then use tracing_map_update_sum() and
+ * tracing_map_read_sum() to update or read a given sum field for the
+ * tracing_map_elt.
+ *
+ * The client can at any point retrieve and traverse the current set
+ * of inserted tracing_map_elts in a tracing_map, via
+ * tracing_map_sort_entries().  Sorting can be done on any field,
+ * including keys.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+struct tracing_map *tracing_map_create(unsigned int map_bits,
+				       unsigned int key_size,
+				       const struct tracing_map_ops *ops,
+				       void *private_data)
+{
+	struct tracing_map *map;
+	unsigned int i;
+
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		return ERR_PTR(-EINVAL);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	map->map_bits = map_bits;
+	map->max_elts = (1 << map_bits);
+	atomic_set(&map->next_elt, -1);
+
+	map->map_size = (1 << (map_bits + 1));
+	map->ops = ops;
+
+	map->private_data = private_data;
+
+	map->map = tracing_map_array_alloc(map->map_size,
+					   sizeof(struct tracing_map_entry));
+	if (!map->map)
+		goto free;
+
+	map->key_size = key_size;
+	for (i = 0; i < TRACING_MAP_KEYS_MAX; i++)
+		map->key_idx[i] = -1;
+ out:
+	return map;
+ free:
+	tracing_map_destroy(map);
+	map = ERR_PTR(-ENOMEM);
+
+	goto out;
+}
+
+/**
+ * tracing_map_init - Allocate and clear a map's tracing_map_elts
+ * @map: The tracing_map to initialize
+ *
+ * Allocates a clears a pool of tracing_map_elts equal to the
+ * user-specified size of 2 ** map_bits (internally maintained as
+ * 'max_elts' in struct tracing_map).  Before using, the map fields
+ * should be added to the map with tracing_map_add_sum_field() and
+ * tracing_map_add_key_field().  tracing_map_init() should then be
+ * called to allocate the array of tracing_map_elts, in order to avoid
+ * allocating anything in the map insertion path.  The user-specified
+ * map size reflects the max number of elements requested by the user
+ * - internally we double that in order to keep the table sparse and
+ * keep collisions manageable.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+int tracing_map_init(struct tracing_map *map)
+{
+	int err;
+
+	if (map->n_fields < 2)
+		return -EINVAL; /* need at least 1 key and 1 val */
+
+	err = tracing_map_alloc_elts(map);
+	if (err)
+		return err;
+
+	tracing_map_clear(map);
+
+	return err;
+}
+
+static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	int ret = 0;
+
+	if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
+		ret = 1;
+
+	return ret;
+}
+
+static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	const struct tracing_map_elt *elt_a, *elt_b;
+	struct tracing_map_sort_key *sort_key;
+	struct tracing_map_field *field;
+	tracing_map_cmp_fn_t cmp_fn;
+	void *val_a, *val_b;
+	int ret = 0;
+
+	elt_a = (*a)->elt;
+	elt_b = (*b)->elt;
+
+	sort_key = &elt_a->map->sort_key;
+
+	field = &elt_a->fields[sort_key->field_idx];
+	cmp_fn = field->cmp_fn;
+
+	val_a = &elt_a->fields[sort_key->field_idx].sum;
+	val_b = &elt_b->fields[sort_key->field_idx].sum;
+
+	ret = cmp_fn(val_a, val_b);
+	if (sort_key->descending)
+		ret = -ret;
+
+	return ret;
+}
+
+static int cmp_entries_key(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	const struct tracing_map_elt *elt_a, *elt_b;
+	struct tracing_map_sort_key *sort_key;
+	struct tracing_map_field *field;
+	tracing_map_cmp_fn_t cmp_fn;
+	void *val_a, *val_b;
+	int ret = 0;
+
+	elt_a = (*a)->elt;
+	elt_b = (*b)->elt;
+
+	sort_key = &elt_a->map->sort_key;
+
+	field = &elt_a->fields[sort_key->field_idx];
+
+	cmp_fn = field->cmp_fn;
+
+	val_a = elt_a->key + field->offset;
+	val_b = elt_b->key + field->offset;
+
+	ret = cmp_fn(val_a, val_b);
+	if (sort_key->descending)
+		ret = -ret;
+
+	return ret;
+}
+
+static void destroy_sort_entry(struct tracing_map_sort_entry *entry)
+{
+	if (!entry)
+		return;
+
+	if (entry->elt_copied)
+		tracing_map_elt_free(entry->elt);
+
+	kfree(entry);
+}
+
+/**
+ * tracing_map_destroy_sort_entries - Destroy an array of sort entries
+ * @entries: The entries to destroy
+ * @n_entries: The number of entries in the array
+ *
+ * Destroy the elements returned by a tracing_map_sort_entries() call.
+ */
+void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+				      unsigned int n_entries)
+{
+	unsigned int i;
+
+	for (i = 0; i < n_entries; i++)
+		destroy_sort_entry(entries[i]);
+
+	vfree(entries);
+}
+
+static struct tracing_map_sort_entry *
+create_sort_entry(void *key, struct tracing_map_elt *elt)
+{
+	struct tracing_map_sort_entry *sort_entry;
+
+	sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
+	if (!sort_entry)
+		return NULL;
+
+	sort_entry->key = key;
+	sort_entry->elt = elt;
+
+	return sort_entry;
+}
+
+static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
+{
+	struct tracing_map_elt *dup_elt;
+	unsigned int i;
+
+	dup_elt = tracing_map_elt_alloc(elt->map);
+	if (!dup_elt)
+		return NULL;
+
+	if (elt->map->ops && elt->map->ops->elt_copy)
+		elt->map->ops->elt_copy(dup_elt, elt);
+
+	dup_elt->private_data = elt->private_data;
+	memcpy(dup_elt->key, elt->key, elt->map->key_size);
+
+	for (i = 0; i < elt->map->n_fields; i++) {
+		atomic64_set(&dup_elt->fields[i].sum,
+			     atomic64_read(&elt->fields[i].sum));
+		dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
+	}
+
+	return dup_elt;
+}
+
+static int merge_dup(struct tracing_map_sort_entry **sort_entries,
+		     unsigned int target, unsigned int dup)
+{
+	struct tracing_map_elt *target_elt, *elt;
+	bool first_dup = (target - dup) == 1;
+	int i;
+
+	if (first_dup) {
+		elt = sort_entries[target]->elt;
+		target_elt = copy_elt(elt);
+		if (!target_elt)
+			return -ENOMEM;
+		sort_entries[target]->elt = target_elt;
+		sort_entries[target]->elt_copied = true;
+	} else
+		target_elt = sort_entries[target]->elt;
+
+	elt = sort_entries[dup]->elt;
+
+	for (i = 0; i < elt->map->n_fields; i++)
+		atomic64_add(atomic64_read(&elt->fields[i].sum),
+			     &target_elt->fields[i].sum);
+
+	sort_entries[dup]->dup = true;
+
+	return 0;
+}
+
+static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+		      int n_entries, unsigned int key_size)
+{
+	unsigned int dups = 0, total_dups = 0;
+	int err, i, j;
+	void *key;
+
+	if (n_entries < 2)
+		return total_dups;
+
+	sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+	     (int (*)(const void *, const void *))cmp_entries_dup, NULL);
+
+	key = sort_entries[0]->key;
+	for (i = 1; i < n_entries; i++) {
+		if (!memcmp(sort_entries[i]->key, key, key_size)) {
+			dups++; total_dups++;
+			err = merge_dup(sort_entries, i - dups, i);
+			if (err)
+				return err;
+			continue;
+		}
+		key = sort_entries[i]->key;
+		dups = 0;
+	}
+
+	if (!total_dups)
+		return total_dups;
+
+	for (i = 0, j = 0; i < n_entries; i++) {
+		if (!sort_entries[i]->dup) {
+			sort_entries[j] = sort_entries[i];
+			if (j++ != i)
+				sort_entries[i] = NULL;
+		} else {
+			destroy_sort_entry(sort_entries[i]);
+			sort_entries[i] = NULL;
+		}
+	}
+
+	return total_dups;
+}
+
+static bool is_key(struct tracing_map *map, unsigned int field_idx)
+{
+	unsigned int i;
+
+	for (i = 0; i < map->n_keys; i++)
+		if (map->key_idx[i] == field_idx)
+			return true;
+	return false;
+}
+
+static void sort_secondary(struct tracing_map *map,
+			   const struct tracing_map_sort_entry **entries,
+			   unsigned int n_entries,
+			   struct tracing_map_sort_key *primary_key,
+			   struct tracing_map_sort_key *secondary_key)
+{
+	int (*primary_fn)(const struct tracing_map_sort_entry **,
+			  const struct tracing_map_sort_entry **);
+	int (*secondary_fn)(const struct tracing_map_sort_entry **,
+			    const struct tracing_map_sort_entry **);
+	unsigned i, start = 0, n_sub = 1;
+
+	if (is_key(map, primary_key->field_idx))
+		primary_fn = cmp_entries_key;
+	else
+		primary_fn = cmp_entries_sum;
+
+	if (is_key(map, secondary_key->field_idx))
+		secondary_fn = cmp_entries_key;
+	else
+		secondary_fn = cmp_entries_sum;
+
+	for (i = 0; i < n_entries - 1; i++) {
+		const struct tracing_map_sort_entry **a = &entries[i];
+		const struct tracing_map_sort_entry **b = &entries[i + 1];
+
+		if (primary_fn(a, b) == 0) {
+			n_sub++;
+			if (i < n_entries - 2)
+				continue;
+		}
+
+		if (n_sub < 2) {
+			start = i + 1;
+			n_sub = 1;
+			continue;
+		}
+
+		set_sort_key(map, secondary_key);
+		sort(&entries[start], n_sub,
+		     sizeof(struct tracing_map_sort_entry *),
+		     (int (*)(const void *, const void *))secondary_fn, NULL);
+		set_sort_key(map, primary_key);
+
+		start = i + 1;
+		n_sub = 1;
+	}
+}
+
+/**
+ * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
+ * @map: The tracing_map
+ * @sort_key: The sort key to use for sorting
+ * @sort_entries: outval: pointer to allocated and sorted array of entries
+ *
+ * tracing_map_sort_entries() sorts the current set of entries in the
+ * map and returns the list of tracing_map_sort_entries containing
+ * them to the client in the sort_entries param.  The client can
+ * access the struct tracing_map_elt element of interest directly as
+ * the 'elt' field of a returned struct tracing_map_sort_entry object.
+ *
+ * The sort_key has only two fields: idx and descending.  'idx' refers
+ * to the index of the field added via tracing_map_add_sum_field() or
+ * tracing_map_add_key_field() when the tracing_map was initialized.
+ * 'descending' is a flag that if set reverses the sort order, which
+ * by default is ascending.
+ *
+ * The client should not hold on to the returned array but should use
+ * it and call tracing_map_destroy_sort_entries() when done.
+ *
+ * Return: the number of sort_entries in the struct tracing_map_sort_entry
+ * array, negative on error
+ */
+int tracing_map_sort_entries(struct tracing_map *map,
+			     struct tracing_map_sort_key *sort_keys,
+			     unsigned int n_sort_keys,
+			     struct tracing_map_sort_entry ***sort_entries)
+{
+	int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
+			      const struct tracing_map_sort_entry **);
+	struct tracing_map_sort_entry *sort_entry, **entries;
+	int i, n_entries, ret;
+
+	entries = vmalloc(map->max_elts * sizeof(sort_entry));
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0, n_entries = 0; i < map->map_size; i++) {
+		struct tracing_map_entry *entry;
+
+		entry = TRACING_MAP_ENTRY(map->map, i);
+
+		if (!entry->key || !entry->val)
+			continue;
+
+		entries[n_entries] = create_sort_entry(entry->val->key,
+						       entry->val);
+		if (!entries[n_entries++]) {
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	if (n_entries == 0) {
+		ret = 0;
+		goto free;
+	}
+
+	if (n_entries == 1) {
+		*sort_entries = entries;
+		return 1;
+	}
+
+	ret = merge_dups(entries, n_entries, map->key_size);
+	if (ret < 0)
+		goto free;
+	n_entries -= ret;
+
+	if (is_key(map, sort_keys[0].field_idx))
+		cmp_entries_fn = cmp_entries_key;
+	else
+		cmp_entries_fn = cmp_entries_sum;
+
+	set_sort_key(map, &sort_keys[0]);
+
+	sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+	     (int (*)(const void *, const void *))cmp_entries_fn, NULL);
+
+	if (n_sort_keys > 1)
+		sort_secondary(map,
+			       (const struct tracing_map_sort_entry **)entries,
+			       n_entries,
+			       &sort_keys[0],
+			       &sort_keys[1]);
+
+	*sort_entries = entries;
+
+	return n_entries;
+ free:
+	tracing_map_destroy_sort_entries(entries, n_entries);
+
+	return ret;
+}
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
new file mode 100644
index 000000000000..75718255a8ad
--- /dev/null
+++ b/kernel/trace/tracing_map.h
@@ -0,0 +1,282 @@
+#ifndef __TRACING_MAP_H
+#define __TRACING_MAP_H
+
+#define TRACING_MAP_BITS_DEFAULT	11
+#define TRACING_MAP_BITS_MAX		17
+#define TRACING_MAP_BITS_MIN		7
+
+#define TRACING_MAP_FIELDS_MAX		4
+#define TRACING_MAP_KEYS_MAX		2
+
+#define TRACING_MAP_SORT_KEYS_MAX	2
+
+typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
+
+/*
+ * This is an overview of the tracing_map data structures and how they
+ * relate to the tracing_map API.  The details of the algorithms
+ * aren't discussed here - this is just a general overview of the data
+ * structures and how they interact with the API.
+ *
+ * The central data structure of the tracing_map is an initially
+ * zeroed array of struct tracing_map_entry (stored in the map field
+ * of struct tracing_map).  tracing_map_entry is a very simple data
+ * structure containing only two fields: a 32-bit unsigned 'key'
+ * variable and a pointer named 'val'.  This array of struct
+ * tracing_map_entry is essentially a hash table which will be
+ * modified by a single function, tracing_map_insert(), but which can
+ * be traversed and read by a user at any time (though the user does
+ * this indirectly via an array of tracing_map_sort_entry - see the
+ * explanation of that data structure in the discussion of the
+ * sorting-related data structures below).
+ *
+ * The central function of the tracing_map API is
+ * tracing_map_insert().  tracing_map_insert() hashes the
+ * arbitrarily-sized key passed into it into a 32-bit unsigned key.
+ * It then uses this key, truncated to the array size, as an index
+ * into the array of tracing_map_entries.  If the value of the 'key'
+ * field of the tracing_map_entry found at that location is 0, then
+ * that entry is considered to be free and can be claimed, by
+ * replacing the 0 in the 'key' field of the tracing_map_entry with
+ * the new 32-bit hashed key.  Once claimed, that tracing_map_entry's
+ * 'val' field is then used to store a unique element which will be
+ * forever associated with that 32-bit hashed key in the
+ * tracing_map_entry.
+ *
+ * That unique element now in the tracing_map_entry's 'val' field is
+ * an instance of tracing_map_elt, where 'elt' in the latter part of
+ * that variable name is short for 'element'.  The purpose of a
+ * tracing_map_elt is to hold values specific to the the particular
+ * 32-bit hashed key it's assocated with.  Things such as the unique
+ * set of aggregated sums associated with the 32-bit hashed key, along
+ * with a copy of the full key associated with the entry, and which
+ * was used to produce the 32-bit hashed key.
+ *
+ * When tracing_map_create() is called to create the tracing map, the
+ * user specifies (indirectly via the map_bits param, the details are
+ * unimportant for this discussion) the maximum number of elements
+ * that the map can hold (stored in the max_elts field of struct
+ * tracing_map).  This is the maximum possible number of
+ * tracing_map_entries in the tracing_map_entry array which can be
+ * 'claimed' as described in the above discussion, and therefore is
+ * also the maximum number of tracing_map_elts that can be associated
+ * with the tracing_map_entry array in the tracing_map.  Because of
+ * the way the insertion algorithm works, the size of the allocated
+ * tracing_map_entry array is always twice the maximum number of
+ * elements (2 * max_elts).  This value is stored in the map_size
+ * field of struct tracing_map.
+ *
+ * Because tracing_map_insert() needs to work from any context,
+ * including from within the memory allocation functions themselves,
+ * both the tracing_map_entry array and a pool of max_elts
+ * tracing_map_elts are pre-allocated before any call is made to
+ * tracing_map_insert().
+ *
+ * The tracing_map_entry array is allocated as a single block by
+ * tracing_map_create().
+ *
+ * Because the tracing_map_elts are much larger objects and can't
+ * generally be allocated together as a single large array without
+ * failure, they're allocated individually, by tracing_map_init().
+ *
+ * The pool of tracing_map_elts are allocated by tracing_map_init()
+ * rather than by tracing_map_create() because at the time
+ * tracing_map_create() is called, there isn't enough information to
+ * create the tracing_map_elts.  Specifically,the user first needs to
+ * tell the tracing_map implementation how many fields the
+ * tracing_map_elts contain, and which types of fields they are (key
+ * or sum).  The user does this via the tracing_map_add_sum_field()
+ * and tracing_map_add_key_field() functions, following which the user
+ * calls tracing_map_init() to finish up the tracing map setup.  The
+ * array holding the pointers which make up the pre-allocated pool of
+ * tracing_map_elts is allocated as a single block and is stored in
+ * the elts field of struct tracing_map.
+ *
+ * There is also a set of structures used for sorting that might
+ * benefit from some minimal explanation.
+ *
+ * struct tracing_map_sort_key is used to drive the sort at any given
+ * time.  By 'any given time' we mean that a different
+ * tracing_map_sort_key will be used at different times depending on
+ * whether the sort currently being performed is a primary or a
+ * secondary sort.
+ *
+ * The sort key is very simple, consisting of the field index of the
+ * tracing_map_elt field to sort on (which the user saved when adding
+ * the field), and whether the sort should be done in an ascending or
+ * descending order.
+ *
+ * For the convenience of the sorting code, a tracing_map_sort_entry
+ * is created for each tracing_map_elt, again individually allocated
+ * to avoid failures that might be expected if allocated as a single
+ * large array of struct tracing_map_sort_entry.
+ * tracing_map_sort_entry instances are the objects expected by the
+ * various internal sorting functions, and are also what the user
+ * ultimately receives after calling tracing_map_sort_entries().
+ * Because it doesn't make sense for users to access an unordered and
+ * sparsely populated tracing_map directly, the
+ * tracing_map_sort_entries() function is provided so that users can
+ * retrieve a sorted list of all existing elements.  In addition to
+ * the associated tracing_map_elt 'elt' field contained within the
+ * tracing_map_sort_entry, which is the object of interest to the
+ * user, tracing_map_sort_entry objects contain a number of additional
+ * fields which are used for caching and internal purposes and can
+ * safely be ignored.
+*/
+
+struct tracing_map_field {
+	tracing_map_cmp_fn_t		cmp_fn;
+	union {
+		atomic64_t			sum;
+		unsigned int			offset;
+	};
+};
+
+struct tracing_map_elt {
+	struct tracing_map		*map;
+	struct tracing_map_field	*fields;
+	void				*key;
+	void				*private_data;
+};
+
+struct tracing_map_entry {
+	u32				key;
+	struct tracing_map_elt		*val;
+};
+
+struct tracing_map_sort_key {
+	unsigned int			field_idx;
+	bool				descending;
+};
+
+struct tracing_map_sort_entry {
+	void				*key;
+	struct tracing_map_elt		*elt;
+	bool				elt_copied;
+	bool				dup;
+};
+
+struct tracing_map_array {
+	unsigned int entries_per_page;
+	unsigned int entry_size_shift;
+	unsigned int entry_shift;
+	unsigned int entry_mask;
+	unsigned int n_pages;
+	void **pages;
+};
+
+#define TRACING_MAP_ARRAY_ELT(array, idx)				\
+	(array->pages[idx >> array->entry_shift] +			\
+	 ((idx & array->entry_mask) << array->entry_size_shift))
+
+#define TRACING_MAP_ENTRY(array, idx)					\
+	((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx))
+
+#define TRACING_MAP_ELT(array, idx)					\
+	((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx))
+
+struct tracing_map {
+	unsigned int			key_size;
+	unsigned int			map_bits;
+	unsigned int			map_size;
+	unsigned int			max_elts;
+	atomic_t			next_elt;
+	struct tracing_map_array	*elts;
+	struct tracing_map_array	*map;
+	const struct tracing_map_ops	*ops;
+	void				*private_data;
+	struct tracing_map_field	fields[TRACING_MAP_FIELDS_MAX];
+	unsigned int			n_fields;
+	int				key_idx[TRACING_MAP_KEYS_MAX];
+	unsigned int			n_keys;
+	struct tracing_map_sort_key	sort_key;
+	atomic64_t			hits;
+	atomic64_t			drops;
+};
+
+/**
+ * struct tracing_map_ops - callbacks for tracing_map
+ *
+ * The methods in this structure define callback functions for various
+ * operations on a tracing_map or objects related to a tracing_map.
+ *
+ * For a detailed description of tracing_map_elt objects please see
+ * the overview of tracing_map data structures at the beginning of
+ * this file.
+ *
+ * All the methods below are optional.
+ *
+ * @elt_alloc: When a tracing_map_elt is allocated, this function, if
+ *	defined, will be called and gives clients the opportunity to
+ *	allocate additional data and attach it to the element
+ *	(tracing_map_elt->private_data is meant for that purpose).
+ *	Element allocation occurs before tracing begins, when the
+ *	tracing_map_init() call is made by client code.
+ *
+ * @elt_copy: At certain points in the lifetime of an element, it may
+ *	need to be copied.  The copy should include a copy of the
+ *	client-allocated data, which can be copied into the 'to'
+ *	element from the 'from' element.
+ *
+ * @elt_free: When a tracing_map_elt is freed, this function is called
+ *	and allows client-allocated per-element data to be freed.
+ *
+ * @elt_clear: This callback allows per-element client-defined data to
+ *	be cleared, if applicable.
+ *
+ * @elt_init: This callback allows per-element client-defined data to
+ *	be initialized when used i.e. when the element is actually
+ *	claimed by tracing_map_insert() in the context of the map
+ *	insertion.
+ */
+struct tracing_map_ops {
+	int			(*elt_alloc)(struct tracing_map_elt *elt);
+	void			(*elt_copy)(struct tracing_map_elt *to,
+					    struct tracing_map_elt *from);
+	void			(*elt_free)(struct tracing_map_elt *elt);
+	void			(*elt_clear)(struct tracing_map_elt *elt);
+	void			(*elt_init)(struct tracing_map_elt *elt);
+};
+
+extern struct tracing_map *
+tracing_map_create(unsigned int map_bits,
+		   unsigned int key_size,
+		   const struct tracing_map_ops *ops,
+		   void *private_data);
+extern int tracing_map_init(struct tracing_map *map);
+
+extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_key_field(struct tracing_map *map,
+				     unsigned int offset,
+				     tracing_map_cmp_fn_t cmp_fn);
+
+extern void tracing_map_destroy(struct tracing_map *map);
+extern void tracing_map_clear(struct tracing_map *map);
+
+extern struct tracing_map_elt *
+tracing_map_insert(struct tracing_map *map, void *key);
+extern struct tracing_map_elt *
+tracing_map_lookup(struct tracing_map *map, void *key);
+
+extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+						int field_is_signed);
+extern int tracing_map_cmp_string(void *val_a, void *val_b);
+extern int tracing_map_cmp_none(void *val_a, void *val_b);
+
+extern void tracing_map_update_sum(struct tracing_map_elt *elt,
+				   unsigned int i, u64 n);
+extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern void tracing_map_set_field_descr(struct tracing_map *map,
+					unsigned int i,
+					unsigned int key_offset,
+					tracing_map_cmp_fn_t cmp_fn);
+extern int
+tracing_map_sort_entries(struct tracing_map *map,
+			 struct tracing_map_sort_key *sort_keys,
+			 unsigned int n_sort_keys,
+			 struct tracing_map_sort_entry ***sort_entries);
+
+extern void
+tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+				 unsigned int n_entries);
+#endif /* __TRACING_MAP_H */
-- 
cgit v1.2.3


From 8d44f2f34fb50f40185ef6404b0047223a27ba82 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 19 Feb 2016 14:47:00 -0500
Subject: tracing: Fix TRACING_MAP Kconfig

The config option for TRACING_MAP has "default n", which is not needed
because the default of configs is 'n'.

Also, since the TRACING_MAP has no config prompt, there's no reason to
include "If in doubt, say N" in the help text.

Fixed a typo in the comments of tracing_map.h.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig       | 3 ---
 kernel/trace/tracing_map.h | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 48b732943e0e..d39556fd863a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -531,7 +531,6 @@ config MMIOTRACE
 config TRACING_MAP
 	bool
 	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
-	default n
 	help
 	  tracing_map is a special-purpose lock-free map for tracing,
 	  separated out as a stand-alone facility in order to allow it
@@ -539,8 +538,6 @@ config TRACING_MAP
 	  generally used outside of that context, and is normally
 	  selected by tracers that use it.
 
-	  If in doubt, say N.
-
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 75718255a8ad..1f7eda548787 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -46,7 +46,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
  * That unique element now in the tracing_map_entry's 'val' field is
  * an instance of tracing_map_elt, where 'elt' in the latter part of
  * that variable name is short for 'element'.  The purpose of a
- * tracing_map_elt is to hold values specific to the the particular
+ * tracing_map_elt is to hold values specific to the particular
  * 32-bit hashed key it's assocated with.  Things such as the unique
  * set of aggregated sums associated with the 32-bit hashed key, along
  * with a copy of the full key associated with the entry, and which
-- 
cgit v1.2.3


From 3b772b96b8338bca2532839b2cd7802800e66037 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:41 -0600
Subject: tracing: Update some tracing_map constants and comments

Make it clear exactly how many keys and values are supported through
better defines, and add 1 to the vals count, since normally clients
want support for at least a hitcount and two other values.

Also, note the error return value for tracing_map_add_key/val_field()
in the comments.

Link: http://lkml.kernel.org/r/6696fa02ebc716aa344c27a571a2afaa25e5b4d4.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 4 ++--
 kernel/trace/tracing_map.h | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index dd6401dd145e..e0f172932eca 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -163,7 +163,7 @@ static int tracing_map_add_field(struct tracing_map *map,
  * tracing_map_update_sum() or reading it via tracing_map_read_sum().
  *
  * Return: The index identifying the field in the map and associated
- * tracing_map_elts.
+ * tracing_map_elts, or -EINVAL on error.
  */
 int tracing_map_add_sum_field(struct tracing_map *map)
 {
@@ -184,7 +184,7 @@ int tracing_map_add_sum_field(struct tracing_map *map)
  * the key referenced by this key field resides.
  *
  * Return: The index identifying the field in the map and associated
- * tracing_map_elts.
+ * tracing_map_elts, or -EINVAL on error.
  */
 int tracing_map_add_key_field(struct tracing_map *map,
 			      unsigned int offset,
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 1f7eda548787..618838f5f30a 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -5,9 +5,10 @@
 #define TRACING_MAP_BITS_MAX		17
 #define TRACING_MAP_BITS_MIN		7
 
-#define TRACING_MAP_FIELDS_MAX		4
 #define TRACING_MAP_KEYS_MAX		2
-
+#define TRACING_MAP_VALS_MAX		3
+#define TRACING_MAP_FIELDS_MAX		(TRACING_MAP_KEYS_MAX + \
+					 TRACING_MAP_VALS_MAX)
 #define TRACING_MAP_SORT_KEYS_MAX	2
 
 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
-- 
cgit v1.2.3


From 7ef224d1d0e3a1ade02d02c01ce1dcffb736d2c3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:42 -0600
Subject: tracing: Add 'hist' event trigger command

'hist' triggers allow users to continually aggregate trace events,
which can then be viewed afterwards by simply reading a 'hist' file
containing the aggregation in a human-readable format.

The basic idea is very simple and boils down to a mechanism whereby
trace events, rather than being exhaustively dumped in raw form and
viewed directly, are automatically 'compressed' into meaningful tables
completely defined by the user.

This is done strictly via single-line command-line commands and
without the aid of any kind of programming language or interpreter.

A surprising number of typical use cases can be accomplished by users
via this simple mechanism.  In fact, a large number of the tasks that
users typically do using the more complicated script-based tracing
tools, at least during the initial stages of an investigation, can be
accomplished by simply specifying a set of keys and values to be used
in the creation of a hash table.

The Linux kernel trace event subsystem happens to provide an extensive
list of keys and values ready-made for such a purpose in the form of
the event format files associated with each trace event.  By simply
consulting the format file for field names of interest and by plugging
them into the hist trigger command, users can create an endless number
of useful aggregations to help with investigating various properties
of the system.  See Documentation/trace/events.txt for examples.

hist triggers are implemented on top of the existing event trigger
infrastructure, and as such are consistent with the existing triggers
from a user's perspective as well.

The basic syntax follows the existing trigger syntax.  Users start an
aggregation by writing a 'hist' trigger to the event of interest's
trigger file:

  # echo hist:keys=xxx [ if filter] > event/trigger

Once a hist trigger has been set up, by default it continually
aggregates every matching event into a hash table using the event key
and a value field named 'hitcount'.

To view the aggregation at any point in time, simply read the 'hist'
file in the same directory as the 'trigger' file:

  # cat event/hist

The detailed syntax provides additional options for user control, and
is described exhaustively in Documentation/trace/events.txt and in the
virtual tracing/README file in the tracing subsystem.

Link: http://lkml.kernel.org/r/72d263b5e1853fe9c314953b65833c3aa75479f2.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig                |  16 +
 kernel/trace/Makefile               |   1 +
 kernel/trace/trace.c                |  17 +
 kernel/trace/trace.h                |   7 +
 kernel/trace/trace_events.c         |   4 +
 kernel/trace/trace_events_hist.c    | 849 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_trigger.c |   1 +
 7 files changed, 895 insertions(+)
 create mode 100644 kernel/trace/trace_events_hist.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d39556fd863a..fafeaf803bd0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -538,6 +538,22 @@ config TRACING_MAP
 	  generally used outside of that context, and is normally
 	  selected by tracers that use it.
 
+config HIST_TRIGGERS
+	bool "Histogram triggers"
+	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select TRACING_MAP
+	default n
+	help
+	  Hist triggers allow one or more arbitrary trace event fields
+	  to be aggregated into hash tables and dumped to stdout by
+	  reading a debugfs/tracefs file.  They're useful for
+	  gathering quick and dirty (though precise) summaries of
+	  event activity as an initial guide for further investigation
+	  using more advanced tools.
+
+	  See Documentation/trace/events.txt.
+	  If in doubt, say N.
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 4255c4057aaa..979e7bfbde7a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
 obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0d12dbde8399..6cf8fd03b028 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3812,6 +3812,9 @@ static const char readme_msg[] =
 #endif
 #ifdef CONFIG_TRACER_SNAPSHOT
 	"\t\t    snapshot\n"
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t\t    hist (see below)\n"
 #endif
 	"\t   example: echo traceoff > events/block/block_unplug/trigger\n"
 	"\t            echo traceoff:3 > events/block/block_unplug/trigger\n"
@@ -3828,6 +3831,20 @@ static const char readme_msg[] =
 	"\t   To remove a trigger with a count:\n"
 	"\t     echo '!<trigger>:0 > <system>/<event>/trigger\n"
 	"\t   Filters can be ignored when removing a trigger.\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
+	"\t    Format: hist:keys=<field1>\n"
+	"\t            [:size=#entries]\n"
+	"\t            [if <filter>]\n\n"
+	"\t    When a matching event is hit, an entry is added to a hash\n"
+	"\t    table using the key named, and the value of a sum called\n"
+	"\t    'hitcount' is incremented.  Keys correspond to fields in the\n"
+	"\t    event's format description.  Keys can be any field.  The\n"
+	"\t    'size' parameter can be  used to specify more or fewer than\n"
+	"\t    the default 2048 entries for the hashtable size.\n\n"
+	"\t    Reading the 'hist' file for the event will dump the hash\n"
+	"\t    table in its entirety to stdout."
+#endif
 ;
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2525042760e6..505f8a45f426 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1162,6 +1162,13 @@ extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
 extern const struct file_operations event_trigger_fops;
+extern const struct file_operations event_hist_fops;
+
+#ifdef CONFIG_HIST_TRIGGERS
+extern int register_trigger_hist_cmd(void);
+#else
+static inline int register_trigger_hist_cmd(void) { return 0; }
+#endif
 
 extern int register_trigger_cmds(void);
 extern void clear_event_triggers(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index add81dff7520..e7cb983ee93c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2141,6 +2141,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	trace_create_file("trigger", 0644, file->dir, file,
 			  &event_trigger_fops);
 
+#ifdef CONFIG_HIST_TRIGGERS
+	trace_create_file("hist", 0444, file->dir, file,
+			  &event_hist_fops);
+#endif
 	trace_create_file("format", 0444, file->dir, call,
 			  &ftrace_event_format_fops);
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
new file mode 100644
index 000000000000..23b45e462117
--- /dev/null
+++ b/kernel/trace/trace_events_hist.c
@@ -0,0 +1,849 @@
+/*
+ * trace_events_hist - trace event hist triggers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+struct hist_field;
+
+typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+
+struct hist_field {
+	struct ftrace_event_field	*field;
+	unsigned long			flags;
+	hist_field_fn_t			fn;
+	unsigned int			size;
+};
+
+static u64 hist_field_counter(struct hist_field *field, void *event)
+{
+	return 1;
+}
+
+static u64 hist_field_string(struct hist_field *hist_field, void *event)
+{
+	char *addr = (char *)(event + hist_field->field->offset);
+
+	return (u64)(unsigned long)addr;
+}
+
+#define DEFINE_HIST_FIELD_FN(type)					\
+static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+{									\
+	type *addr = (type *)(event + hist_field->field->offset);	\
+									\
+	return (u64)*addr;						\
+}
+
+DEFINE_HIST_FIELD_FN(s64);
+DEFINE_HIST_FIELD_FN(u64);
+DEFINE_HIST_FIELD_FN(s32);
+DEFINE_HIST_FIELD_FN(u32);
+DEFINE_HIST_FIELD_FN(s16);
+DEFINE_HIST_FIELD_FN(u16);
+DEFINE_HIST_FIELD_FN(s8);
+DEFINE_HIST_FIELD_FN(u8);
+
+#define for_each_hist_field(i, hist_data)	\
+	for ((i) = 0; (i) < (hist_data)->n_fields; (i)++)
+
+#define for_each_hist_val_field(i, hist_data)	\
+	for ((i) = 0; (i) < (hist_data)->n_vals; (i)++)
+
+#define for_each_hist_key_field(i, hist_data)	\
+	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
+
+#define HITCOUNT_IDX		0
+#define HIST_KEY_MAX		1
+#define HIST_KEY_SIZE_MAX	MAX_FILTER_STR_VAL
+
+enum hist_field_flags {
+	HIST_FIELD_FL_HITCOUNT	= 1,
+	HIST_FIELD_FL_KEY	= 2,
+	HIST_FIELD_FL_STRING	= 4,
+};
+
+struct hist_trigger_attrs {
+	char		*keys_str;
+	unsigned int	map_bits;
+};
+
+struct hist_trigger_data {
+	struct hist_field               *fields[TRACING_MAP_FIELDS_MAX];
+	unsigned int			n_vals;
+	unsigned int			n_keys;
+	unsigned int			n_fields;
+	unsigned int			key_size;
+	struct tracing_map_sort_key	sort_keys[TRACING_MAP_SORT_KEYS_MAX];
+	unsigned int			n_sort_keys;
+	struct trace_event_file		*event_file;
+	struct hist_trigger_attrs	*attrs;
+	struct tracing_map		*map;
+};
+
+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+{
+	hist_field_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = hist_field_s64;
+		else
+			fn = hist_field_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = hist_field_s32;
+		else
+			fn = hist_field_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = hist_field_s16;
+		else
+			fn = hist_field_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = hist_field_s8;
+		else
+			fn = hist_field_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int parse_map_size(char *str)
+{
+	unsigned long size, map_bits;
+	int ret;
+
+	strsep(&str, "=");
+	if (!str) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = kstrtoul(str, 0, &size);
+	if (ret)
+		goto out;
+
+	map_bits = ilog2(roundup_pow_of_two(size));
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		ret = -EINVAL;
+	else
+		ret = map_bits;
+ out:
+	return ret;
+}
+
+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
+{
+	if (!attrs)
+		return;
+
+	kfree(attrs->keys_str);
+	kfree(attrs);
+}
+
+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
+{
+	struct hist_trigger_attrs *attrs;
+	int ret = 0;
+
+	attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return ERR_PTR(-ENOMEM);
+
+	while (trigger_str) {
+		char *str = strsep(&trigger_str, ":");
+
+		if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+		    (strncmp(str, "keys=", strlen("keys=")) == 0))
+			attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "size=", strlen("size=")) == 0) {
+			int map_bits = parse_map_size(str);
+
+			if (map_bits < 0) {
+				ret = map_bits;
+				goto free;
+			}
+			attrs->map_bits = map_bits;
+		} else {
+			ret = -EINVAL;
+			goto free;
+		}
+	}
+
+	if (!attrs->keys_str) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	return attrs;
+ free:
+	destroy_hist_trigger_attrs(attrs);
+
+	return ERR_PTR(ret);
+}
+
+static void destroy_hist_field(struct hist_field *hist_field)
+{
+	kfree(hist_field);
+}
+
+static struct hist_field *create_hist_field(struct ftrace_event_field *field,
+					    unsigned long flags)
+{
+	struct hist_field *hist_field;
+
+	if (field && is_function_field(field))
+		return NULL;
+
+	hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+	if (!hist_field)
+		return NULL;
+
+	if (flags & HIST_FIELD_FL_HITCOUNT) {
+		hist_field->fn = hist_field_counter;
+		goto out;
+	}
+
+	if (is_string_field(field)) {
+		flags |= HIST_FIELD_FL_STRING;
+		hist_field->fn = hist_field_string;
+	} else {
+		hist_field->fn = select_value_fn(field->size,
+						 field->is_signed);
+		if (!hist_field->fn) {
+			destroy_hist_field(hist_field);
+			return NULL;
+		}
+	}
+ out:
+	hist_field->field = field;
+	hist_field->flags = flags;
+
+	return hist_field;
+}
+
+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+		if (hist_data->fields[i]) {
+			destroy_hist_field(hist_data->fields[i]);
+			hist_data->fields[i] = NULL;
+		}
+	}
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+	hist_data->fields[HITCOUNT_IDX] =
+		create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
+	if (!hist_data->fields[HITCOUNT_IDX])
+		return -ENOMEM;
+
+	hist_data->n_vals++;
+
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int create_val_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	int ret;
+
+	ret = create_hitcount_val(hist_data);
+
+	return ret;
+}
+
+static int create_key_field(struct hist_trigger_data *hist_data,
+			    unsigned int key_idx,
+			    struct trace_event_file *file,
+			    char *field_str)
+{
+	struct ftrace_event_field *field = NULL;
+	unsigned long flags = 0;
+	unsigned int key_size;
+	int ret = 0;
+
+	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+		return -EINVAL;
+
+	flags |= HIST_FIELD_FL_KEY;
+
+	field = trace_find_event_field(file->event_call, field_str);
+	if (!field) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key_size = field->size;
+
+	hist_data->fields[key_idx] = create_hist_field(field, flags);
+	if (!hist_data->fields[key_idx]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key_size = ALIGN(key_size, sizeof(u64));
+	hist_data->fields[key_idx]->size = key_size;
+	hist_data->key_size = key_size;
+	if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	hist_data->n_keys++;
+
+	if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
+		return -EINVAL;
+
+	ret = key_size;
+ out:
+	return ret;
+}
+
+static int create_key_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	unsigned int i, n_vals = hist_data->n_vals;
+	char *fields_str, *field_str;
+	int ret = -EINVAL;
+
+	fields_str = hist_data->attrs->keys_str;
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str)
+		goto out;
+
+	for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) {
+		field_str = strsep(&fields_str, ",");
+		if (!field_str)
+			break;
+		ret = create_key_field(hist_data, i, file, field_str);
+		if (ret < 0)
+			goto out;
+	}
+	if (fields_str) {
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = 0;
+ out:
+	return ret;
+}
+
+static int create_hist_fields(struct hist_trigger_data *hist_data,
+			      struct trace_event_file *file)
+{
+	int ret;
+
+	ret = create_val_fields(hist_data, file);
+	if (ret)
+		goto out;
+
+	ret = create_key_fields(hist_data, file);
+	if (ret)
+		goto out;
+
+	hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+ out:
+	return ret;
+}
+
+static int create_sort_keys(struct hist_trigger_data *hist_data)
+{
+	int ret = 0;
+
+	hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */
+
+	return ret;
+}
+
+static void destroy_hist_data(struct hist_trigger_data *hist_data)
+{
+	destroy_hist_trigger_attrs(hist_data->attrs);
+	destroy_hist_fields(hist_data);
+	tracing_map_destroy(hist_data->map);
+	kfree(hist_data);
+}
+
+static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
+{
+	struct tracing_map *map = hist_data->map;
+	struct ftrace_event_field *field;
+	struct hist_field *hist_field;
+	unsigned int i, idx;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (hist_field->flags & HIST_FIELD_FL_KEY) {
+			tracing_map_cmp_fn_t cmp_fn;
+
+			field = hist_field->field;
+
+			if (is_string_field(field))
+				cmp_fn = tracing_map_cmp_string;
+			else
+				cmp_fn = tracing_map_cmp_num(field->size,
+							     field->is_signed);
+			idx = tracing_map_add_key_field(map, 0, cmp_fn);
+		} else
+			idx = tracing_map_add_sum_field(map);
+
+		if (idx < 0)
+			return idx;
+	}
+
+	return 0;
+}
+
+static struct hist_trigger_data *
+create_hist_data(unsigned int map_bits,
+		 struct hist_trigger_attrs *attrs,
+		 struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data;
+	int ret = 0;
+
+	hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL);
+	if (!hist_data)
+		return ERR_PTR(-ENOMEM);
+
+	hist_data->attrs = attrs;
+
+	ret = create_hist_fields(hist_data, file);
+	if (ret)
+		goto free;
+
+	ret = create_sort_keys(hist_data);
+	if (ret)
+		goto free;
+
+	hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
+					    NULL, hist_data);
+	if (IS_ERR(hist_data->map)) {
+		ret = PTR_ERR(hist_data->map);
+		hist_data->map = NULL;
+		goto free;
+	}
+
+	ret = create_tracing_map_fields(hist_data);
+	if (ret)
+		goto free;
+
+	ret = tracing_map_init(hist_data->map);
+	if (ret)
+		goto free;
+
+	hist_data->event_file = file;
+ out:
+	return hist_data;
+ free:
+	hist_data->attrs = NULL;
+
+	destroy_hist_data(hist_data);
+
+	hist_data = ERR_PTR(ret);
+
+	goto out;
+}
+
+static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
+				    struct tracing_map_elt *elt,
+				    void *rec)
+{
+	struct hist_field *hist_field;
+	unsigned int i;
+	u64 hist_val;
+
+	for_each_hist_val_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		hist_val = hist_field->fn(hist_field, rec);
+		tracing_map_update_sum(elt, i, hist_val);
+	}
+}
+
+static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct hist_field *key_field;
+	struct tracing_map_elt *elt;
+	u64 field_contents;
+	void *key = NULL;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		field_contents = key_field->fn(key_field, rec);
+		if (key_field->flags & HIST_FIELD_FL_STRING)
+			key = (void *)(unsigned long)field_contents;
+		else
+			key = (void *)&field_contents;
+	}
+
+	elt = tracing_map_insert(hist_data->map, key);
+	if (elt)
+		hist_trigger_elt_update(hist_data, elt, rec);
+}
+
+static void
+hist_trigger_entry_print(struct seq_file *m,
+			 struct hist_trigger_data *hist_data, void *key,
+			 struct tracing_map_elt *elt)
+{
+	struct hist_field *key_field;
+	unsigned int i;
+	u64 uval;
+
+	seq_puts(m, "{ ");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (i > hist_data->n_vals)
+			seq_puts(m, ", ");
+
+		if (key_field->flags & HIST_FIELD_FL_STRING) {
+			seq_printf(m, "%s: %-50s", key_field->field->name,
+				   (char *)key);
+		} else {
+			uval = *(u64 *)key;
+			seq_printf(m, "%s: %10llu",
+				   key_field->field->name, uval);
+		}
+	}
+
+	seq_puts(m, " }");
+
+	seq_printf(m, " hitcount: %10llu",
+		   tracing_map_read_sum(elt, HITCOUNT_IDX));
+
+	seq_puts(m, "\n");
+}
+
+static int print_entries(struct seq_file *m,
+			 struct hist_trigger_data *hist_data)
+{
+	struct tracing_map_sort_entry **sort_entries = NULL;
+	struct tracing_map *map = hist_data->map;
+	unsigned int i, n_entries;
+
+	n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
+					     hist_data->n_sort_keys,
+					     &sort_entries);
+	if (n_entries < 0)
+		return n_entries;
+
+	for (i = 0; i < n_entries; i++)
+		hist_trigger_entry_print(m, hist_data,
+					 sort_entries[i]->key,
+					 sort_entries[i]->elt);
+
+	tracing_map_destroy_sort_entries(sort_entries, n_entries);
+
+	return n_entries;
+}
+
+static int hist_show(struct seq_file *m, void *v)
+{
+	struct event_trigger_data *test, *data = NULL;
+	struct trace_event_file *event_file;
+	struct hist_trigger_data *hist_data;
+	int n_entries, ret = 0;
+
+	mutex_lock(&event_mutex);
+
+	event_file = event_file_data(m->private);
+	if (unlikely(!event_file)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(test, &event_file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			data = test;
+			break;
+		}
+	}
+	if (!data)
+		goto out_unlock;
+
+	seq_puts(m, "# event histogram\n#\n# trigger info: ");
+	data->ops->print(m, data->ops, data);
+	seq_puts(m, "\n");
+
+	hist_data = data->private_data;
+	n_entries = print_entries(m, hist_data);
+	if (n_entries < 0) {
+		ret = n_entries;
+		n_entries = 0;
+	}
+
+	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
+		   (u64)atomic64_read(&hist_data->map->hits),
+		   n_entries, (u64)atomic64_read(&hist_data->map->drops));
+ out_unlock:
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int event_hist_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, hist_show, file);
+}
+
+const struct file_operations event_hist_fops = {
+	.open = event_hist_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
+{
+	seq_printf(m, "%s", hist_field->field->name);
+}
+
+static int event_hist_trigger_print(struct seq_file *m,
+				    struct event_trigger_ops *ops,
+				    struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct hist_field *key_field;
+	unsigned int i;
+
+	seq_puts(m, "hist:keys=");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (i > hist_data->n_vals)
+			seq_puts(m, ",");
+
+		hist_field_print(m, key_field);
+	}
+
+	seq_puts(m, ":vals=");
+	seq_puts(m, "hitcount");
+
+	seq_puts(m, ":sort=");
+	seq_puts(m, "hitcount");
+
+	seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+
+	if (data->filter_str)
+		seq_printf(m, " if %s", data->filter_str);
+
+	seq_puts(m, " [active]");
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static void event_hist_trigger_free(struct event_trigger_ops *ops,
+				    struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+
+	if (WARN_ON_ONCE(data->ref <= 0))
+		return;
+
+	data->ref--;
+	if (!data->ref) {
+		trigger_data_free(data);
+		destroy_hist_data(hist_data);
+	}
+}
+
+static struct event_trigger_ops event_hist_trigger_ops = {
+	.func			= event_hist_trigger,
+	.print			= event_hist_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_hist_trigger_free,
+};
+
+static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
+							    char *param)
+{
+	return &event_hist_trigger_ops;
+}
+
+static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
+				 struct event_trigger_data *data,
+				 struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+	int ret = 0;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+
+	if (data->ops->init) {
+		ret = data->ops->init(data->ops, data);
+		if (ret < 0)
+			goto out;
+	}
+
+	list_add_rcu(&data->list, &file->triggers);
+	ret++;
+
+	update_cond_flag(file);
+	if (trace_event_trigger_enable_disable(file, 1) < 0) {
+		list_del_rcu(&data->list);
+		update_cond_flag(file);
+		ret--;
+	}
+ out:
+	return ret;
+}
+
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+				   struct trace_event_file *file,
+				   char *glob, char *cmd, char *param)
+{
+	unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
+	struct event_trigger_data *trigger_data;
+	struct hist_trigger_attrs *attrs;
+	struct event_trigger_ops *trigger_ops;
+	struct hist_trigger_data *hist_data;
+	char *trigger;
+	int ret = 0;
+
+	if (!param)
+		return -EINVAL;
+
+	/* separate the trigger from the filter (k:v [if filter]) */
+	trigger = strsep(&param, " \t");
+	if (!trigger)
+		return -EINVAL;
+
+	attrs = parse_hist_trigger_attrs(trigger);
+	if (IS_ERR(attrs))
+		return PTR_ERR(attrs);
+
+	if (attrs->map_bits)
+		hist_trigger_bits = attrs->map_bits;
+
+	hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+	if (IS_ERR(hist_data)) {
+		destroy_hist_trigger_attrs(attrs);
+		return PTR_ERR(hist_data);
+	}
+
+	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+	ret = -ENOMEM;
+	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+	if (!trigger_data)
+		goto out_free;
+
+	trigger_data->count = -1;
+	trigger_data->ops = trigger_ops;
+	trigger_data->cmd_ops = cmd_ops;
+
+	INIT_LIST_HEAD(&trigger_data->list);
+	RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+	trigger_data->private_data = hist_data;
+
+	if (glob[0] == '!') {
+		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+		ret = 0;
+		goto out_free;
+	}
+
+	if (!param) /* if param is non-empty, it's supposed to be a filter */
+		goto out_reg;
+
+	if (!cmd_ops->set_filter)
+		goto out_reg;
+
+	ret = cmd_ops->set_filter(param, trigger_data, file);
+	if (ret < 0)
+		goto out_free;
+ out_reg:
+	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+	/*
+	 * The above returns on success the # of triggers registered,
+	 * but if it didn't register any it returns zero.  Consider no
+	 * triggers registered a failure too.
+	 */
+	if (!ret) {
+		ret = -ENOENT;
+		goto out_free;
+	} else if (ret < 0)
+		goto out_free;
+	/* Just return zero, not the number of registered triggers */
+	ret = 0;
+ out:
+	return ret;
+ out_free:
+	if (cmd_ops->set_filter)
+		cmd_ops->set_filter(NULL, trigger_data, NULL);
+
+	kfree(trigger_data);
+
+	destroy_hist_data(hist_data);
+	goto out;
+}
+
+static struct event_command trigger_hist_cmd = {
+	.name			= "hist",
+	.trigger_type		= ETT_EVENT_HIST,
+	.flags			= EVENT_CMD_FL_NEEDS_REC,
+	.func			= event_hist_trigger_func,
+	.reg			= hist_register_trigger,
+	.unreg			= unregister_trigger,
+	.get_trigger_ops	= event_hist_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+__init int register_trigger_hist_cmd(void)
+{
+	int ret;
+
+	ret = register_event_command(&trigger_hist_cmd);
+	WARN_ON(ret < 0);
+
+	return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d67992f3bb0e..d29092afe005 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1447,6 +1447,7 @@ __init int register_trigger_cmds(void)
 	register_trigger_snapshot_cmd();
 	register_trigger_stacktrace_cmd();
 	register_trigger_enable_disable_cmds();
+	register_trigger_hist_cmd();
 
 	return 0;
 }
-- 
cgit v1.2.3


From f2606835d70d2a2e6a134f01821da8149e124796 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:43 -0600
Subject: tracing: Add hist trigger support for multiple values ('vals=' param)

Allow users to specify trace event fields to use in aggregated sums
via a new 'vals=' keyword.  Before this addition, the only aggregated
sum supported was the implied value 'hitcount'.  With this addition,
'hitcount' is also supported as an explicit value field, as is any
numeric trace event field.

This expands the hist trigger syntax from this:

  # echo hist:keys=xxx [ if filter] > event/trigger

to this:

  # echo hist:keys=xxx:vals=yyy [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/2a5d1adb5ba6c65d7bb2148e379f2fed47f29a68.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             | 12 +++---
 kernel/trace/trace_events_hist.c | 79 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6cf8fd03b028..bb62f54c5480 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3834,14 +3834,16 @@ static const char readme_msg[] =
 #ifdef CONFIG_HIST_TRIGGERS
 	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
 	"\t    Format: hist:keys=<field1>\n"
+	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
-	"\t    table using the key named, and the value of a sum called\n"
-	"\t    'hitcount' is incremented.  Keys correspond to fields in the\n"
-	"\t    event's format description.  Keys can be any field.  The\n"
-	"\t    'size' parameter can be  used to specify more or fewer than\n"
-	"\t    the default 2048 entries for the hashtable size.\n\n"
+	"\t    table using the key(s) and value(s) named, and the value of a\n"
+	"\t    sum called 'hitcount' is incremented.  Keys and values\n"
+	"\t    correspond to fields in the event's format description.  Keys\n"
+	"\t    can be any field.  Values must correspond to numeric fields.\n"
+	"\t    The 'size' parameter can be  used to specify more or fewer\n"
+	"\t    than the default 2048 entries for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout."
 #endif
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 23b45e462117..1b33590b9c8f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -84,6 +84,7 @@ enum hist_field_flags {
 
 struct hist_trigger_attrs {
 	char		*keys_str;
+	char		*vals_str;
 	unsigned int	map_bits;
 };
 
@@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 		return;
 
 	kfree(attrs->keys_str);
+	kfree(attrs->vals_str);
 	kfree(attrs);
 }
 
@@ -183,6 +185,10 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 		if ((strncmp(str, "key=", strlen("key=")) == 0) ||
 		    (strncmp(str, "keys=", strlen("keys=")) == 0))
 			attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+			 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+			 (strncmp(str, "values=", strlen("values=")) == 0))
+			attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -276,13 +282,70 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data)
 	return 0;
 }
 
+static int create_val_field(struct hist_trigger_data *hist_data,
+			    unsigned int val_idx,
+			    struct trace_event_file *file,
+			    char *field_str)
+{
+	struct ftrace_event_field *field = NULL;
+	unsigned long flags = 0;
+	int ret = 0;
+
+	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+	field = trace_find_event_field(file->event_call, field_str);
+	if (!field) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	hist_data->fields[val_idx] = create_hist_field(field, flags);
+	if (!hist_data->fields[val_idx]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++hist_data->n_vals;
+
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+		ret = -EINVAL;
+ out:
+	return ret;
+}
+
 static int create_val_fields(struct hist_trigger_data *hist_data,
 			     struct trace_event_file *file)
 {
+	char *fields_str, *field_str;
+	unsigned int i, j;
 	int ret;
 
 	ret = create_hitcount_val(hist_data);
+	if (ret)
+		goto out;
 
+	fields_str = hist_data->attrs->vals_str;
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str)
+		goto out;
+
+	for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX &&
+		     j < TRACING_MAP_VALS_MAX; i++) {
+		field_str = strsep(&fields_str, ",");
+		if (!field_str)
+			break;
+		if (strcmp(field_str, "hitcount") == 0)
+			continue;
+		ret = create_val_field(hist_data, j++, file, field_str);
+		if (ret)
+			goto out;
+	}
+	if (fields_str && (strcmp(fields_str, "hitcount") != 0))
+		ret = -EINVAL;
+ out:
 	return ret;
 }
 
@@ -552,6 +615,12 @@ hist_trigger_entry_print(struct seq_file *m,
 	seq_printf(m, " hitcount: %10llu",
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
+	for (i = 1; i < hist_data->n_vals; i++) {
+		seq_printf(m, "  %s: %10llu",
+			   hist_data->fields[i]->field->name,
+			   tracing_map_read_sum(elt, i));
+	}
+
 	seq_puts(m, "\n");
 }
 
@@ -659,7 +728,15 @@ static int event_hist_trigger_print(struct seq_file *m,
 	}
 
 	seq_puts(m, ":vals=");
-	seq_puts(m, "hitcount");
+
+	for_each_hist_val_field(i, hist_data) {
+		if (i == HITCOUNT_IDX)
+			seq_puts(m, "hitcount");
+		else {
+			seq_puts(m, ",");
+			hist_field_print(m, hist_data->fields[i]);
+		}
+	}
 
 	seq_puts(m, ":sort=");
 	seq_puts(m, "hitcount");
-- 
cgit v1.2.3


From 76a3b0c8ac344e1d0f436160cbb59b670b086947 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:44 -0600
Subject: tracing: Add hist trigger support for compound keys

Allow users to specify multiple trace event fields to use in keys by
allowing multiple fields in the 'keys=' keyword.  With this addition,
any unique combination of any of the fields named in the 'keys'
keyword will result in a new entry being added to the hash table.

Link: http://lkml.kernel.org/r/0cfa24e6ac3b0dcece7737d94aa1f322ae3afc4b.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             | 10 ++++++----
 kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++++++++++-----------
 2 files changed, 36 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62f54c5480..7a4c4dcf0bfe 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3833,7 +3833,7 @@ static const char readme_msg[] =
 	"\t   Filters can be ignored when removing a trigger.\n"
 #ifdef CONFIG_HIST_TRIGGERS
 	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
-	"\t    Format: hist:keys=<field1>\n"
+	"\t    Format: hist:keys=<field1[,field2,...]>\n"
 	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [if <filter>]\n\n"
@@ -3841,9 +3841,11 @@ static const char readme_msg[] =
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
 	"\t    sum called 'hitcount' is incremented.  Keys and values\n"
 	"\t    correspond to fields in the event's format description.  Keys\n"
-	"\t    can be any field.  Values must correspond to numeric fields.\n"
-	"\t    The 'size' parameter can be  used to specify more or fewer\n"
-	"\t    than the default 2048 entries for the hashtable size.\n\n"
+	"\t    can be any field.  Compound keys consisting of up to two\n"
+	"\t    fields can be specified by the 'keys' keyword.  Values must\n"
+	"\t    correspond to numeric fields.  The 'size' parameter can be\n"
+	"\t    used to specify more or fewer than the default 2048 entries\n"
+	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout."
 #endif
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1b33590b9c8f..65fdfc6cb633 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -32,6 +32,7 @@ struct hist_field {
 	unsigned long			flags;
 	hist_field_fn_t			fn;
 	unsigned int			size;
+	unsigned int			offset;
 };
 
 static u64 hist_field_counter(struct hist_field *field, void *event)
@@ -73,8 +74,7 @@ DEFINE_HIST_FIELD_FN(u8);
 	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
 
 #define HITCOUNT_IDX		0
-#define HIST_KEY_MAX		1
-#define HIST_KEY_SIZE_MAX	MAX_FILTER_STR_VAL
+#define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + sizeof(u64))
 
 enum hist_field_flags {
 	HIST_FIELD_FL_HITCOUNT	= 1,
@@ -351,6 +351,7 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
 
 static int create_key_field(struct hist_trigger_data *hist_data,
 			    unsigned int key_idx,
+			    unsigned int key_offset,
 			    struct trace_event_file *file,
 			    char *field_str)
 {
@@ -380,7 +381,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 
 	key_size = ALIGN(key_size, sizeof(u64));
 	hist_data->fields[key_idx]->size = key_size;
-	hist_data->key_size = key_size;
+	hist_data->fields[key_idx]->offset = key_offset;
+	hist_data->key_size += key_size;
 	if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
 		ret = -EINVAL;
 		goto out;
@@ -399,7 +401,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 static int create_key_fields(struct hist_trigger_data *hist_data,
 			     struct trace_event_file *file)
 {
-	unsigned int i, n_vals = hist_data->n_vals;
+	unsigned int i, key_offset = 0, n_vals = hist_data->n_vals;
 	char *fields_str, *field_str;
 	int ret = -EINVAL;
 
@@ -411,13 +413,15 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
 	if (!fields_str)
 		goto out;
 
-	for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) {
+	for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) {
 		field_str = strsep(&fields_str, ",");
 		if (!field_str)
 			break;
-		ret = create_key_field(hist_data, i, file, field_str);
+		ret = create_key_field(hist_data, i, key_offset,
+				       file, field_str);
 		if (ret < 0)
 			goto out;
+		key_offset += ret;
 	}
 	if (fields_str) {
 		ret = -EINVAL;
@@ -482,7 +486,10 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 			else
 				cmp_fn = tracing_map_cmp_num(field->size,
 							     field->is_signed);
-			idx = tracing_map_add_key_field(map, 0, cmp_fn);
+			idx = tracing_map_add_key_field(map,
+							hist_field->offset,
+							cmp_fn);
+
 		} else
 			idx = tracing_map_add_sum_field(map);
 
@@ -562,12 +569,16 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
+	char compound_key[HIST_KEY_SIZE_MAX];
 	struct hist_field *key_field;
 	struct tracing_map_elt *elt;
 	u64 field_contents;
 	void *key = NULL;
 	unsigned int i;
 
+	if (hist_data->n_keys > 1)
+		memset(compound_key, 0, hist_data->key_size);
+
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
 
@@ -576,8 +587,16 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 			key = (void *)(unsigned long)field_contents;
 		else
 			key = (void *)&field_contents;
+
+		if (hist_data->n_keys > 1) {
+			memcpy(compound_key + key_field->offset, key,
+			       key_field->size);
+		}
 	}
 
+	if (hist_data->n_keys > 1)
+		key = compound_key;
+
 	elt = tracing_map_insert(hist_data->map, key);
 	if (elt)
 		hist_trigger_elt_update(hist_data, elt, rec);
@@ -602,11 +621,11 @@ hist_trigger_entry_print(struct seq_file *m,
 
 		if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
-				   (char *)key);
+				   (char *)(key + key_field->offset));
 		} else {
-			uval = *(u64 *)key;
-			seq_printf(m, "%s: %10llu",
-				   key_field->field->name, uval);
+			uval = *(u64 *)(key + key_field->offset);
+			seq_printf(m, "%s: %10llu", key_field->field->name,
+				   uval);
 		}
 	}
 
-- 
cgit v1.2.3


From e62347d2453474fa514fbbbc4636313d34d3c850 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:45 -0600
Subject: tracing: Add hist trigger support for user-defined sorting ('sort='
 param)

Allow users to specify keys and/or values to sort on.  With this
addition, keys and values specified using the 'keys=' and 'vals='
keywords can be used to sort the hist trigger output via a new 'sort='
keyword.  If multiple sort keys are specified, the output will be
sorted using the second key as a secondary sort key, etc.  The default
sort order is ascending; if the user wants a different sort order,
'.descending' can be appended to the specific sort key.  Before this
addition, output was always sorted by 'hitcount' in ascending order.

This expands the hist trigger syntax from this:

    # echo hist:keys=xxx:vals=yyy \
          [ if filter] > event/trigger

to this:

    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \
          [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/b30a41db66ba486979c4f987aff5fab500ea53b3.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |   6 ++-
 kernel/trace/trace_events_hist.c | 112 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 114 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7a4c4dcf0bfe..d16fa8e1cdbf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3835,6 +3835,7 @@ static const char readme_msg[] =
 	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
 	"\t    Format: hist:keys=<field1[,field2,...]>\n"
 	"\t            [:values=<field1[,field2,...]>]\n"
+	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
@@ -3843,7 +3844,10 @@ static const char readme_msg[] =
 	"\t    correspond to fields in the event's format description.  Keys\n"
 	"\t    can be any field.  Compound keys consisting of up to two\n"
 	"\t    fields can be specified by the 'keys' keyword.  Values must\n"
-	"\t    correspond to numeric fields.  The 'size' parameter can be\n"
+	"\t    correspond to numeric fields.  Sort keys consisting of up to\n"
+	"\t    two fields can be specified using the 'sort' keyword.  The\n"
+	"\t    sort direction can be modified by appending '.descending' or\n"
+	"\t    '.ascending' to a sort field.  The 'size' parameter can be\n"
 	"\t    used to specify more or fewer than the default 2048 entries\n"
 	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 65fdfc6cb633..03ba4530c08c 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -85,6 +85,7 @@ enum hist_field_flags {
 struct hist_trigger_attrs {
 	char		*keys_str;
 	char		*vals_str;
+	char		*sort_key_str;
 	unsigned int	map_bits;
 };
 
@@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 	if (!attrs)
 		return;
 
+	kfree(attrs->sort_key_str);
 	kfree(attrs->keys_str);
 	kfree(attrs->vals_str);
 	kfree(attrs);
@@ -189,6 +191,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 			 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
 			 (strncmp(str, "values=", strlen("values=")) == 0))
 			attrs->vals_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
+			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -450,12 +454,92 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
 	return ret;
 }
 
+static int is_descending(const char *str)
+{
+	if (!str)
+		return 0;
+
+	if (strcmp(str, "descending") == 0)
+		return 1;
+
+	if (strcmp(str, "ascending") == 0)
+		return 0;
+
+	return -EINVAL;
+}
+
 static int create_sort_keys(struct hist_trigger_data *hist_data)
 {
-	int ret = 0;
+	char *fields_str = hist_data->attrs->sort_key_str;
+	struct ftrace_event_field *field = NULL;
+	struct tracing_map_sort_key *sort_key;
+	int descending, ret = 0;
+	unsigned int i, j;
+
+	hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
+
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+		char *field_str, *field_name;
+
+		sort_key = &hist_data->sort_keys[i];
+
+		field_str = strsep(&fields_str, ",");
+		if (!field_str) {
+			if (i == 0)
+				ret = -EINVAL;
+			break;
+		}
+
+		if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) {
+			ret = -EINVAL;
+			break;
+		}
 
-	hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */
+		field_name = strsep(&field_str, ".");
+		if (!field_name) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (strcmp(field_name, "hitcount") == 0) {
+			descending = is_descending(field_str);
+			if (descending < 0) {
+				ret = descending;
+				break;
+			}
+			sort_key->descending = descending;
+			continue;
+		}
 
+		for (j = 1; j < hist_data->n_fields; j++) {
+			field = hist_data->fields[j]->field;
+			if (field && (strcmp(field_name, field->name) == 0)) {
+				sort_key->field_idx = j;
+				descending = is_descending(field_str);
+				if (descending < 0) {
+					ret = descending;
+					goto out;
+				}
+				sort_key->descending = descending;
+				break;
+			}
+		}
+		if (j == hist_data->n_fields) {
+			ret = -EINVAL;
+			break;
+		}
+	}
+	hist_data->n_sort_keys = i;
+ out:
 	return ret;
 }
 
@@ -758,7 +842,29 @@ static int event_hist_trigger_print(struct seq_file *m,
 	}
 
 	seq_puts(m, ":sort=");
-	seq_puts(m, "hitcount");
+
+	for (i = 0; i < hist_data->n_sort_keys; i++) {
+		struct tracing_map_sort_key *sort_key;
+
+		sort_key = &hist_data->sort_keys[i];
+
+		if (i > 0)
+			seq_puts(m, ",");
+
+		if (sort_key->field_idx == HITCOUNT_IDX)
+			seq_puts(m, "hitcount");
+		else {
+			unsigned int idx = sort_key->field_idx;
+
+			if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
+				return -EINVAL;
+
+			hist_field_print(m, hist_data->fields[idx]);
+		}
+
+		if (sort_key->descending)
+			seq_puts(m, ".descending");
+	}
 
 	seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
 
-- 
cgit v1.2.3


From 83e99914c9e2677d8a80f2a23eca0d215d5bfb0f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:46 -0600
Subject: tracing: Add hist trigger support for pausing and continuing a trace

Allow users to append 'pause' or 'continue' to an existing trigger in
order to have it paused or to have a paused trace continue.

This expands the hist trigger syntax from this:
    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \
          [ if filter] >> event/trigger

to this:

    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause or cont \
          [ if filter] >> event/trigger

Link: http://lkml.kernel.org/r/b672a92c14702cb924cdf6fc27ea1809bed04907.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  7 ++++++-
 kernel/trace/trace_events_hist.c | 31 ++++++++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d16fa8e1cdbf..7d5c63dd410a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3837,6 +3837,7 @@ static const char readme_msg[] =
 	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
+	"\t            [:pause][:continue]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
@@ -3851,7 +3852,11 @@ static const char readme_msg[] =
 	"\t    used to specify more or fewer than the default 2048 entries\n"
 	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
-	"\t    table in its entirety to stdout."
+	"\t    table in its entirety to stdout.\n\n"
+	"\t    The 'pause' parameter can be used to pause an existing hist\n"
+	"\t    trigger or to start a hist trigger but not log any events\n"
+	"\t    until told to do so.  'continue' can be used to start or\n"
+	"\t    restart a paused hist trigger.\n\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 03ba4530c08c..e3fa9c89f125 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -86,6 +86,8 @@ struct hist_trigger_attrs {
 	char		*keys_str;
 	char		*vals_str;
 	char		*sort_key_str;
+	bool		pause;
+	bool		cont;
 	unsigned int	map_bits;
 };
 
@@ -193,6 +195,11 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 			attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
 			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+		else if (strcmp(str, "pause") == 0)
+			attrs->pause = true;
+		else if ((strcmp(str, "cont") == 0) ||
+			 (strcmp(str, "continue") == 0))
+			attrs->cont = true;
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -871,7 +878,10 @@ static int event_hist_trigger_print(struct seq_file *m,
 	if (data->filter_str)
 		seq_printf(m, " if %s", data->filter_str);
 
-	seq_puts(m, " [active]");
+	if (data->paused)
+		seq_puts(m, " [paused]");
+	else
+		seq_puts(m, " [active]");
 
 	seq_putc(m, '\n');
 
@@ -910,16 +920,30 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
 {
+	struct hist_trigger_data *hist_data = data->private_data;
 	struct event_trigger_data *test;
 	int ret = 0;
 
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			ret = -EEXIST;
+			if (hist_data->attrs->pause)
+				test->paused = true;
+			else if (hist_data->attrs->cont)
+				test->paused = false;
+			else
+				ret = -EEXIST;
 			goto out;
 		}
 	}
 
+	if (hist_data->attrs->cont) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (hist_data->attrs->pause)
+		data->paused = true;
+
 	if (data->ops->init) {
 		ret = data->ops->init(data->ops, data);
 		if (ret < 0)
@@ -1011,7 +1035,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	 * triggers registered a failure too.
 	 */
 	if (!ret) {
-		ret = -ENOENT;
+		if (!(attrs->pause || attrs->cont))
+			ret = -ENOENT;
 		goto out_free;
 	} else if (ret < 0)
 		goto out_free;
-- 
cgit v1.2.3


From e86ae9baacfa9efe957df4fc037f079772636d76 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:47 -0600
Subject: tracing: Add hist trigger support for clearing a trace

Allow users to append 'clear' to an existing trigger in order to have
the hash table cleared.

This expands the hist trigger syntax from this:
    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont \
           [ if filter] >> event/trigger

to this:

    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont/clear \
          [ if filter] >> event/trigger

Link: http://lkml.kernel.org/r/ae15dd0d9b2f7af07a37c1ff682063e2dbcdf160.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  5 ++++-
 kernel/trace/trace_events_hist.c | 24 ++++++++++++++++++++++--
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7d5c63dd410a..4097cd3763f7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3837,7 +3837,7 @@ static const char readme_msg[] =
 	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
-	"\t            [:pause][:continue]\n"
+	"\t            [:pause][:continue][:clear]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
@@ -3857,6 +3857,9 @@ static const char readme_msg[] =
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
 	"\t    restart a paused hist trigger.\n\n"
+	"\t    The 'clear' parameter will clear the contents of a running\n"
+	"\t    hist trigger and leave its current paused/active state\n"
+	"\t    unchanged.\n\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index e3fa9c89f125..83cb25e067ad 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -88,6 +88,7 @@ struct hist_trigger_attrs {
 	char		*sort_key_str;
 	bool		pause;
 	bool		cont;
+	bool		clear;
 	unsigned int	map_bits;
 };
 
@@ -200,6 +201,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 		else if ((strcmp(str, "cont") == 0) ||
 			 (strcmp(str, "continue") == 0))
 			attrs->cont = true;
+		else if (strcmp(str, "clear") == 0)
+			attrs->clear = true;
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -916,6 +919,21 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
 	return &event_hist_trigger_ops;
 }
 
+static void hist_clear(struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	bool paused;
+
+	paused = data->paused;
+	data->paused = true;
+
+	synchronize_sched();
+
+	tracing_map_clear(hist_data->map);
+
+	data->paused = paused;
+}
+
 static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
@@ -930,13 +948,15 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				test->paused = true;
 			else if (hist_data->attrs->cont)
 				test->paused = false;
+			else if (hist_data->attrs->clear)
+				hist_clear(test);
 			else
 				ret = -EEXIST;
 			goto out;
 		}
 	}
 
-	if (hist_data->attrs->cont) {
+	if (hist_data->attrs->cont || hist_data->attrs->clear) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -1035,7 +1055,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	 * triggers registered a failure too.
 	 */
 	if (!ret) {
-		if (!(attrs->pause || attrs->cont))
+		if (!(attrs->pause || attrs->cont || attrs->clear))
 			ret = -ENOENT;
 		goto out_free;
 	} else if (ret < 0)
-- 
cgit v1.2.3


From 0c4a6b4666e8eb86dead3f09b40bb8ca4f614e4f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:48 -0600
Subject: tracing: Add hist trigger 'hex' modifier for displaying numeric
 fields

Allow users to have numeric fields displayed as hex values in the
output by appending '.hex' to field names:

   # echo hist:keys=aaa,bbb.hex:vals=ccc.hex ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/67bd431edda2af5798d7694818f7e8d71b6b3463.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  5 +++-
 kernel/trace/trace_events_hist.c | 62 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 60 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4097cd3763f7..2eb2eafbe7f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3852,7 +3852,10 @@ static const char readme_msg[] =
 	"\t    used to specify more or fewer than the default 2048 entries\n"
 	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
-	"\t    table in its entirety to stdout.\n\n"
+	"\t    table in its entirety to stdout.  The default format used to\n"
+	"\t    display a given field can be modified by appending any of the\n"
+	"\t    following modifiers to the field name, as applicable:\n\n"
+	"\t            .hex        display a number as a hex value\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 83cb25e067ad..8e49eeef8867 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -80,6 +80,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_HITCOUNT	= 1,
 	HIST_FIELD_FL_KEY	= 2,
 	HIST_FIELD_FL_STRING	= 4,
+	HIST_FIELD_FL_HEX	= 8,
 };
 
 struct hist_trigger_attrs {
@@ -303,11 +304,23 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 {
 	struct ftrace_event_field *field = NULL;
 	unsigned long flags = 0;
+	char *field_name;
 	int ret = 0;
 
 	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
 		return -EINVAL;
-	field = trace_find_event_field(file->event_call, field_str);
+
+	field_name = strsep(&field_str, ".");
+	if (field_str) {
+		if (strcmp(field_str, "hex") == 0)
+			flags |= HIST_FIELD_FL_HEX;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	field = trace_find_event_field(file->event_call, field_name);
 	if (!field) {
 		ret = -EINVAL;
 		goto out;
@@ -372,6 +385,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	struct ftrace_event_field *field = NULL;
 	unsigned long flags = 0;
 	unsigned int key_size;
+	char *field_name;
 	int ret = 0;
 
 	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
@@ -379,7 +393,17 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 
 	flags |= HIST_FIELD_FL_KEY;
 
-	field = trace_find_event_field(file->event_call, field_str);
+	field_name = strsep(&field_str, ".");
+	if (field_str) {
+		if (strcmp(field_str, "hex") == 0)
+			flags |= HIST_FIELD_FL_HEX;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	field = trace_find_event_field(file->event_call, field_name);
 	if (!field) {
 		ret = -EINVAL;
 		goto out;
@@ -713,7 +737,11 @@ hist_trigger_entry_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ", ");
 
-		if (key_field->flags & HIST_FIELD_FL_STRING) {
+		if (key_field->flags & HIST_FIELD_FL_HEX) {
+			uval = *(u64 *)(key + key_field->offset);
+			seq_printf(m, "%s: %llx",
+				   key_field->field->name, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
 		} else {
@@ -729,9 +757,15 @@ hist_trigger_entry_print(struct seq_file *m,
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
 	for (i = 1; i < hist_data->n_vals; i++) {
-		seq_printf(m, "  %s: %10llu",
-			   hist_data->fields[i]->field->name,
-			   tracing_map_read_sum(elt, i));
+		if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
+			seq_printf(m, "  %s: %10llx",
+				   hist_data->fields[i]->field->name,
+				   tracing_map_read_sum(elt, i));
+		} else {
+			seq_printf(m, "  %s: %10llu",
+				   hist_data->fields[i]->field->name,
+				   tracing_map_read_sum(elt, i));
+		}
 	}
 
 	seq_puts(m, "\n");
@@ -816,9 +850,25 @@ const struct file_operations event_hist_fops = {
 	.release = single_release,
 };
 
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+	const char *flags_str = NULL;
+
+	if (hist_field->flags & HIST_FIELD_FL_HEX)
+		flags_str = "hex";
+
+	return flags_str;
+}
+
 static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 {
 	seq_printf(m, "%s", hist_field->field->name);
+	if (hist_field->flags) {
+		const char *flags_str = get_hist_field_flags(hist_field);
+
+		if (flags_str)
+			seq_printf(m, ".%s", flags_str);
+	}
 }
 
 static int event_hist_trigger_print(struct seq_file *m,
-- 
cgit v1.2.3


From c6afad49d127f6d7c9957319f55173a2198b1ba8 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:49 -0600
Subject: tracing: Add hist trigger 'sym' and 'sym-offset' modifiers

Allow users to have address fields displayed as symbols in the output
by appending '.sym' or 'sym-offset' to field names:

   # echo hist:keys=aaa.sym,bbb.sym-offset ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/87d4935821491c0275513f0fbfb9bab8d3d3f079.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  4 +++-
 kernel/trace/trace_events_hist.c | 29 +++++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2eb2eafbe7f5..34459eb0c844 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3855,7 +3855,9 @@ static const char readme_msg[] =
 	"\t    table in its entirety to stdout.  The default format used to\n"
 	"\t    display a given field can be modified by appending any of the\n"
 	"\t    following modifiers to the field name, as applicable:\n\n"
-	"\t            .hex        display a number as a hex value\n\n"
+	"\t            .hex        display a number as a hex value\n"
+	"\t            .sym        display an address as a symbol\n"
+	"\t            .sym-offset display an address as a symbol and offset\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 8e49eeef8867..808cc06cdb61 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -77,10 +77,12 @@ DEFINE_HIST_FIELD_FN(u8);
 #define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + sizeof(u64))
 
 enum hist_field_flags {
-	HIST_FIELD_FL_HITCOUNT	= 1,
-	HIST_FIELD_FL_KEY	= 2,
-	HIST_FIELD_FL_STRING	= 4,
-	HIST_FIELD_FL_HEX	= 8,
+	HIST_FIELD_FL_HITCOUNT		= 1,
+	HIST_FIELD_FL_KEY		= 2,
+	HIST_FIELD_FL_STRING		= 4,
+	HIST_FIELD_FL_HEX		= 8,
+	HIST_FIELD_FL_SYM		= 16,
+	HIST_FIELD_FL_SYM_OFFSET	= 32,
 };
 
 struct hist_trigger_attrs {
@@ -397,6 +399,10 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	if (field_str) {
 		if (strcmp(field_str, "hex") == 0)
 			flags |= HIST_FIELD_FL_HEX;
+		else if (strcmp(field_str, "sym") == 0)
+			flags |= HIST_FIELD_FL_SYM;
+		else if (strcmp(field_str, "sym-offset") == 0)
+			flags |= HIST_FIELD_FL_SYM_OFFSET;
 		else {
 			ret = -EINVAL;
 			goto out;
@@ -726,6 +732,7 @@ hist_trigger_entry_print(struct seq_file *m,
 			 struct tracing_map_elt *elt)
 {
 	struct hist_field *key_field;
+	char str[KSYM_SYMBOL_LEN];
 	unsigned int i;
 	u64 uval;
 
@@ -741,6 +748,16 @@ hist_trigger_entry_print(struct seq_file *m,
 			uval = *(u64 *)(key + key_field->offset);
 			seq_printf(m, "%s: %llx",
 				   key_field->field->name, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_SYM) {
+			uval = *(u64 *)(key + key_field->offset);
+			sprint_symbol_no_offset(str, uval);
+			seq_printf(m, "%s: [%llx] %-45s",
+				   key_field->field->name, uval, str);
+		} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
+			uval = *(u64 *)(key + key_field->offset);
+			sprint_symbol(str, uval);
+			seq_printf(m, "%s: [%llx] %-55s",
+				   key_field->field->name, uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -856,6 +873,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 
 	if (hist_field->flags & HIST_FIELD_FL_HEX)
 		flags_str = "hex";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM)
+		flags_str = "sym";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+		flags_str = "sym-offset";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From 6b4827ad028a1ab2fc4dcf1f5e6e077018d1b770 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:50 -0600
Subject: tracing: Add hist trigger 'execname' modifier

Allow users to have common_pid field values displayed as program names
in the output by appending '.execname' to a common_pid field name:

   # echo hist:keys=common_pid.execname ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/e172e81f10f5b8d1f08450e3763c850f39fbf698.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |   3 +-
 kernel/trace/trace_events_hist.c | 100 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 34459eb0c844..58a8bee50c6e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3857,7 +3857,8 @@ static const char readme_msg[] =
 	"\t    following modifiers to the field name, as applicable:\n\n"
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
-	"\t            .sym-offset display an address as a symbol and offset\n\n"
+	"\t            .sym-offset display an address as a symbol and offset\n"
+	"\t            .execname   display a common_pid as a program name\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 808cc06cdb61..227e6c21b1eb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -83,6 +83,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_HEX		= 8,
 	HIST_FIELD_FL_SYM		= 16,
 	HIST_FIELD_FL_SYM_OFFSET	= 32,
+	HIST_FIELD_FL_EXECNAME		= 64,
 };
 
 struct hist_trigger_attrs {
@@ -232,6 +233,73 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 	return ERR_PTR(ret);
 }
 
+static inline void save_comm(char *comm, struct task_struct *task)
+{
+	if (!task->pid) {
+		strcpy(comm, "<idle>");
+		return;
+	}
+
+	if (WARN_ON_ONCE(task->pid < 0)) {
+		strcpy(comm, "<XXX>");
+		return;
+	}
+
+	memcpy(comm, task->comm, TASK_COMM_LEN);
+}
+
+static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
+{
+	kfree((char *)elt->private_data);
+}
+
+static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
+{
+	struct hist_trigger_data *hist_data = elt->map->private_data;
+	struct hist_field *key_field;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+			unsigned int size = TASK_COMM_LEN + 1;
+
+			elt->private_data = kzalloc(size, GFP_KERNEL);
+			if (!elt->private_data)
+				return -ENOMEM;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
+				       struct tracing_map_elt *from)
+{
+	char *comm_from = from->private_data;
+	char *comm_to = to->private_data;
+
+	if (comm_from)
+		memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
+}
+
+static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
+{
+	char *comm = elt->private_data;
+
+	if (comm)
+		save_comm(comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
+	.elt_alloc	= hist_trigger_elt_comm_alloc,
+	.elt_copy	= hist_trigger_elt_comm_copy,
+	.elt_free	= hist_trigger_elt_comm_free,
+	.elt_init	= hist_trigger_elt_comm_init,
+};
+
 static void destroy_hist_field(struct hist_field *hist_field)
 {
 	kfree(hist_field);
@@ -403,6 +471,9 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			flags |= HIST_FIELD_FL_SYM;
 		else if (strcmp(field_str, "sym-offset") == 0)
 			flags |= HIST_FIELD_FL_SYM_OFFSET;
+		else if ((strcmp(field_str, "execname") == 0) &&
+			 (strcmp(field_name, "common_pid") == 0))
+			flags |= HIST_FIELD_FL_EXECNAME;
 		else {
 			ret = -EINVAL;
 			goto out;
@@ -624,11 +695,27 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 	return 0;
 }
 
+static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
+{
+	struct hist_field *key_field;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (key_field->flags & HIST_FIELD_FL_EXECNAME)
+			return true;
+	}
+
+	return false;
+}
+
 static struct hist_trigger_data *
 create_hist_data(unsigned int map_bits,
 		 struct hist_trigger_attrs *attrs,
 		 struct trace_event_file *file)
 {
+	const struct tracing_map_ops *map_ops = NULL;
 	struct hist_trigger_data *hist_data;
 	int ret = 0;
 
@@ -646,8 +733,11 @@ create_hist_data(unsigned int map_bits,
 	if (ret)
 		goto free;
 
+	if (need_tracing_map_ops(hist_data))
+		map_ops = &hist_trigger_elt_comm_ops;
+
 	hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
-					    NULL, hist_data);
+					    map_ops, hist_data);
 	if (IS_ERR(hist_data->map)) {
 		ret = PTR_ERR(hist_data->map);
 		hist_data->map = NULL;
@@ -758,6 +848,12 @@ hist_trigger_entry_print(struct seq_file *m,
 			sprint_symbol(str, uval);
 			seq_printf(m, "%s: [%llx] %-55s",
 				   key_field->field->name, uval, str);
+		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+			char *comm = elt->private_data;
+
+			uval = *(u64 *)(key + key_field->offset);
+			seq_printf(m, "%s: %-16s[%10llu]",
+				   key_field->field->name, comm, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -877,6 +973,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "sym";
 	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
 		flags_str = "sym-offset";
+	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+		flags_str = "execname";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From 316961988b5ec71bbf4b2ad447662770349aec13 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:51 -0600
Subject: tracing: Add hist trigger 'syscall' modifier

Allow users to have syscall id fields displayed as syscall names in
the output by appending '.syscall' to field names:

   # echo hist:keys=aaa.syscall ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/2bab1e59933d76a14b545bd2e02f80b8b08ac4d3.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  3 ++-
 kernel/trace/trace_events_hist.c | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 58a8bee50c6e..0dd23c5b1c34 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3858,7 +3858,8 @@ static const char readme_msg[] =
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
-	"\t            .execname   display a common_pid as a program name\n\n"
+	"\t            .execname   display a common_pid as a program name\n"
+	"\t            .syscall    display a syscall id as a syscall name\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 227e6c21b1eb..f9e7ecb33847 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -84,6 +84,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_SYM		= 16,
 	HIST_FIELD_FL_SYM_OFFSET	= 32,
 	HIST_FIELD_FL_EXECNAME		= 64,
+	HIST_FIELD_FL_SYSCALL		= 128,
 };
 
 struct hist_trigger_attrs {
@@ -474,6 +475,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 		else if ((strcmp(field_str, "execname") == 0) &&
 			 (strcmp(field_name, "common_pid") == 0))
 			flags |= HIST_FIELD_FL_EXECNAME;
+		else if (strcmp(field_str, "syscall") == 0)
+			flags |= HIST_FIELD_FL_SYSCALL;
 		else {
 			ret = -EINVAL;
 			goto out;
@@ -854,6 +857,16 @@ hist_trigger_entry_print(struct seq_file *m,
 			uval = *(u64 *)(key + key_field->offset);
 			seq_printf(m, "%s: %-16s[%10llu]",
 				   key_field->field->name, comm, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
+			const char *syscall_name;
+
+			uval = *(u64 *)(key + key_field->offset);
+			syscall_name = get_syscall_name(uval);
+			if (!syscall_name)
+				syscall_name = "unknown_syscall";
+
+			seq_printf(m, "%s: %-30s[%3llu]",
+				   key_field->field->name, syscall_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -975,6 +988,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "sym-offset";
 	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
 		flags_str = "execname";
+	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+		flags_str = "syscall";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From 69a0200c2e25d61c50091549d00cfeb426c258f5 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:52 -0600
Subject: tracing: Add hist trigger support for stacktraces as keys

It's often useful to be able to use a stacktrace as a hash key, for
keeping a count of the number of times a particular call path resulted
in a trace event, for instance.  Add a special key named 'stacktrace'
which can be used as key in a 'keys=' param for this purpose:

    # echo hist:keys=stacktrace ... \
               [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/87515e90b3785232a874a12156174635a348edb1.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  16 ++---
 kernel/trace/trace_events_hist.c | 135 +++++++++++++++++++++++++++++----------
 2 files changed, 109 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0dd23c5b1c34..2238bfde799b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3843,14 +3843,14 @@ static const char readme_msg[] =
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
 	"\t    sum called 'hitcount' is incremented.  Keys and values\n"
 	"\t    correspond to fields in the event's format description.  Keys\n"
-	"\t    can be any field.  Compound keys consisting of up to two\n"
-	"\t    fields can be specified by the 'keys' keyword.  Values must\n"
-	"\t    correspond to numeric fields.  Sort keys consisting of up to\n"
-	"\t    two fields can be specified using the 'sort' keyword.  The\n"
-	"\t    sort direction can be modified by appending '.descending' or\n"
-	"\t    '.ascending' to a sort field.  The 'size' parameter can be\n"
-	"\t    used to specify more or fewer than the default 2048 entries\n"
-	"\t    for the hashtable size.\n\n"
+	"\t    can be any field, or the special string 'stacktrace'.\n"
+	"\t    Compound keys consisting of up to two fields can be specified\n"
+	"\t    by the 'keys' keyword.  Values must correspond to numeric\n"
+	"\t    fields.  Sort keys consisting of up to two fields can be\n"
+	"\t    specified using the 'sort' keyword.  The sort direction can\n"
+	"\t    be modified by appending '.descending' or '.ascending' to a\n"
+	"\t    sort field.  The 'size' parameter can be used to specify more\n"
+	"\t    or fewer than the default 2048 entries for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout.  The default format used to\n"
 	"\t    display a given field can be modified by appending any of the\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f9e7ecb33847..21cae42b54da 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -35,6 +35,11 @@ struct hist_field {
 	unsigned int			offset;
 };
 
+static u64 hist_field_none(struct hist_field *field, void *event)
+{
+	return 0;
+}
+
 static u64 hist_field_counter(struct hist_field *field, void *event)
 {
 	return 1;
@@ -73,8 +78,12 @@ DEFINE_HIST_FIELD_FN(u8);
 #define for_each_hist_key_field(i, hist_data)	\
 	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
 
+#define HIST_STACKTRACE_DEPTH	16
+#define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
+#define HIST_STACKTRACE_SKIP	5
+
 #define HITCOUNT_IDX		0
-#define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + sizeof(u64))
+#define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
 
 enum hist_field_flags {
 	HIST_FIELD_FL_HITCOUNT		= 1,
@@ -85,6 +94,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_SYM_OFFSET	= 32,
 	HIST_FIELD_FL_EXECNAME		= 64,
 	HIST_FIELD_FL_SYSCALL		= 128,
+	HIST_FIELD_FL_STACKTRACE	= 256,
 };
 
 struct hist_trigger_attrs {
@@ -323,6 +333,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		goto out;
 	}
 
+	if (flags & HIST_FIELD_FL_STACKTRACE) {
+		hist_field->fn = hist_field_none;
+		goto out;
+	}
+
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 		hist_field->fn = hist_field_string;
@@ -456,7 +471,6 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	struct ftrace_event_field *field = NULL;
 	unsigned long flags = 0;
 	unsigned int key_size;
-	char *field_name;
 	int ret = 0;
 
 	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
@@ -464,33 +478,39 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 
 	flags |= HIST_FIELD_FL_KEY;
 
-	field_name = strsep(&field_str, ".");
-	if (field_str) {
-		if (strcmp(field_str, "hex") == 0)
-			flags |= HIST_FIELD_FL_HEX;
-		else if (strcmp(field_str, "sym") == 0)
-			flags |= HIST_FIELD_FL_SYM;
-		else if (strcmp(field_str, "sym-offset") == 0)
-			flags |= HIST_FIELD_FL_SYM_OFFSET;
-		else if ((strcmp(field_str, "execname") == 0) &&
-			 (strcmp(field_name, "common_pid") == 0))
-			flags |= HIST_FIELD_FL_EXECNAME;
-		else if (strcmp(field_str, "syscall") == 0)
-			flags |= HIST_FIELD_FL_SYSCALL;
-		else {
+	if (strcmp(field_str, "stacktrace") == 0) {
+		flags |= HIST_FIELD_FL_STACKTRACE;
+		key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
+	} else {
+		char *field_name = strsep(&field_str, ".");
+
+		if (field_str) {
+			if (strcmp(field_str, "hex") == 0)
+				flags |= HIST_FIELD_FL_HEX;
+			else if (strcmp(field_str, "sym") == 0)
+				flags |= HIST_FIELD_FL_SYM;
+			else if (strcmp(field_str, "sym-offset") == 0)
+				flags |= HIST_FIELD_FL_SYM_OFFSET;
+			else if ((strcmp(field_str, "execname") == 0) &&
+				 (strcmp(field_name, "common_pid") == 0))
+				flags |= HIST_FIELD_FL_EXECNAME;
+			else if (strcmp(field_str, "syscall") == 0)
+				flags |= HIST_FIELD_FL_SYSCALL;
+			else {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+
+		field = trace_find_event_field(file->event_call, field_name);
+		if (!field) {
 			ret = -EINVAL;
 			goto out;
 		}
-	}
 
-	field = trace_find_event_field(file->event_call, field_name);
-	if (!field) {
-		ret = -EINVAL;
-		goto out;
+		key_size = field->size;
 	}
 
-	key_size = field->size;
-
 	hist_data->fields[key_idx] = create_hist_field(field, flags);
 	if (!hist_data->fields[key_idx]) {
 		ret = -ENOMEM;
@@ -679,7 +699,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 
 			field = hist_field->field;
 
-			if (is_string_field(field))
+			if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
+				cmp_fn = tracing_map_cmp_none;
+			else if (is_string_field(field))
 				cmp_fn = tracing_map_cmp_string;
 			else
 				cmp_fn = tracing_map_cmp_num(field->size,
@@ -786,7 +808,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
+	unsigned long entries[HIST_STACKTRACE_DEPTH];
 	char compound_key[HIST_KEY_SIZE_MAX];
+	struct stack_trace stacktrace;
 	struct hist_field *key_field;
 	struct tracing_map_elt *elt;
 	u64 field_contents;
@@ -799,15 +823,27 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
 
-		field_contents = key_field->fn(key_field, rec);
-		if (key_field->flags & HIST_FIELD_FL_STRING)
-			key = (void *)(unsigned long)field_contents;
-		else
-			key = (void *)&field_contents;
+		if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+			stacktrace.max_entries = HIST_STACKTRACE_DEPTH;
+			stacktrace.entries = entries;
+			stacktrace.nr_entries = 0;
+			stacktrace.skip = HIST_STACKTRACE_SKIP;
 
-		if (hist_data->n_keys > 1) {
-			memcpy(compound_key + key_field->offset, key,
-			       key_field->size);
+			memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
+			save_stack_trace(&stacktrace);
+
+			key = entries;
+		} else {
+			field_contents = key_field->fn(key_field, rec);
+			if (key_field->flags & HIST_FIELD_FL_STRING)
+				key = (void *)(unsigned long)field_contents;
+			else
+				key = (void *)&field_contents;
+
+			if (hist_data->n_keys > 1) {
+				memcpy(compound_key + key_field->offset, key,
+				       key_field->size);
+			}
 		}
 	}
 
@@ -819,6 +855,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 		hist_trigger_elt_update(hist_data, elt, rec);
 }
 
+static void hist_trigger_stacktrace_print(struct seq_file *m,
+					  unsigned long *stacktrace_entries,
+					  unsigned int max_entries)
+{
+	char str[KSYM_SYMBOL_LEN];
+	unsigned int spaces = 8;
+	unsigned int i;
+
+	for (i = 0; i < max_entries; i++) {
+		if (stacktrace_entries[i] == ULONG_MAX)
+			return;
+
+		seq_printf(m, "%*c", 1 + spaces, ' ');
+		sprint_symbol(str, stacktrace_entries[i]);
+		seq_printf(m, "%s\n", str);
+	}
+}
+
 static void
 hist_trigger_entry_print(struct seq_file *m,
 			 struct hist_trigger_data *hist_data, void *key,
@@ -826,6 +880,7 @@ hist_trigger_entry_print(struct seq_file *m,
 {
 	struct hist_field *key_field;
 	char str[KSYM_SYMBOL_LEN];
+	bool multiline = false;
 	unsigned int i;
 	u64 uval;
 
@@ -867,6 +922,12 @@ hist_trigger_entry_print(struct seq_file *m,
 
 			seq_printf(m, "%s: %-30s[%3llu]",
 				   key_field->field->name, syscall_name, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+			seq_puts(m, "stacktrace:\n");
+			hist_trigger_stacktrace_print(m,
+						      key + key_field->offset,
+						      HIST_STACKTRACE_DEPTH);
+			multiline = true;
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -877,7 +938,10 @@ hist_trigger_entry_print(struct seq_file *m,
 		}
 	}
 
-	seq_puts(m, " }");
+	if (!multiline)
+		seq_puts(m, " ");
+
+	seq_puts(m, "}");
 
 	seq_printf(m, " hitcount: %10llu",
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
@@ -1021,7 +1085,10 @@ static int event_hist_trigger_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ",");
 
-		hist_field_print(m, key_field);
+		if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
+			seq_puts(m, "stacktrace");
+		else
+			hist_field_print(m, key_field);
 	}
 
 	seq_puts(m, ":vals=");
-- 
cgit v1.2.3


From 79e577cbce4c4c2bf0d64ec315adb04eda40736b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 3 Mar 2016 12:54:53 -0600
Subject: tracing: Support string type key properly

The string in a trace event is usually recorded as dynamic array which
is variable length.  But current hist code only support fixed length
array so it cannot support most strings.

This patch fixes it by checking filter_type of the field and get
proper pointer with it.  With this, it can get a histogram of exec()
based on filenames like below:

  # cd /sys/kernel/tracing/events/sched/sched_process_exec
  # cat 'hist:key=filename' > trigger
  # ps
   PID TTY       TIME CMD
     1 ?     00:00:00 init
    29 ?     00:00:00 sh
    38 ?     00:00:00 ps
  # ls
  enable  filter  format  hist  id  trigger
  # cat hist
  # trigger info: hist:keys=filename:vals=hitcount:sort=hitcount:size=2048 [active]

  { filename: /usr/bin/ps                         } hitcount:          1
  { filename: /usr/bin/ls                         } hitcount:          1
  { filename: /usr/bin/cat                        } hitcount:          1

  Totals:
      Hits: 3
      Entries: 3
      Dropped: 0

Link: http://lkml.kernel.org/r/610180d6df0cfdf11ee205452f3b241dea657233.1457029949.git.tom.zanussi@linux.intel.com

Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ Added (unsigned long) typecast to fix compile warning ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 49 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 21cae42b54da..418ff27cbbaf 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -52,12 +52,28 @@ static u64 hist_field_string(struct hist_field *hist_field, void *event)
 	return (u64)(unsigned long)addr;
 }
 
+static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
+{
+	u32 str_item = *(u32 *)(event + hist_field->field->offset);
+	int str_loc = str_item & 0xffff;
+	char *addr = (char *)(event + str_loc);
+
+	return (u64)(unsigned long)addr;
+}
+
+static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
+{
+	char **addr = (char **)(event + hist_field->field->offset);
+
+	return (u64)(unsigned long)*addr;
+}
+
 #define DEFINE_HIST_FIELD_FN(type)					\
 static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
 {									\
 	type *addr = (type *)(event + hist_field->field->offset);	\
 									\
-	return (u64)*addr;						\
+	return (u64)(unsigned long)*addr;				\
 }
 
 DEFINE_HIST_FIELD_FN(s64);
@@ -340,7 +356,13 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
-		hist_field->fn = hist_field_string;
+
+		if (field->filter_type == FILTER_STATIC_STRING)
+			hist_field->fn = hist_field_string;
+		else if (field->filter_type == FILTER_DYN_STRING)
+			hist_field->fn = hist_field_dynstring;
+		else
+			hist_field->fn = hist_field_pstring;
 	} else {
 		hist_field->fn = select_value_fn(field->size,
 						 field->is_signed);
@@ -508,7 +530,10 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			goto out;
 		}
 
-		key_size = field->size;
+		if (is_string_field(field)) /* should be last key field */
+			key_size = HIST_KEY_SIZE_MAX - key_offset;
+		else
+			key_size = field->size;
 	}
 
 	hist_data->fields[key_idx] = create_hist_field(field, flags);
@@ -841,8 +866,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 				key = (void *)&field_contents;
 
 			if (hist_data->n_keys > 1) {
+				/* ensure NULL-termination */
+				size_t size = key_field->size - 1;
+
+				if (key_field->flags & HIST_FIELD_FL_STRING) {
+					struct ftrace_event_field *field;
+
+					field = key_field->field;
+					if (field->filter_type == FILTER_DYN_STRING)
+						size = *(u32 *)(rec + field->offset) >> 16;
+					else if (field->filter_type == FILTER_PTR_STRING)
+						size = strlen(key);
+
+					if (size > key_field->size - 1)
+						size = key_field->size - 1;
+				}
+
 				memcpy(compound_key + key_field->offset, key,
-				       key_field->size);
+				       size);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 6a475cb17fe161dbde62eca7e4c2cf59edff182f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:54 -0600
Subject: tracing: Remove restriction on string position in hist trigger keys

If we assume the maximum size for a string field, we don't have to
worry about its position.  Since we only allow two keys in a compound
key and having more than one string key in a given compound key
doesn't make much sense anyway, trading a bit of extra space instead
of introducing an arbitrary restriction makes more sense.

We also need to use the event field size for static strings when
copying the contents, otherwise we get random garbage in the key.

Also, cast string return values to avoid warnings on 32-bit compiles.

Finally, rearrange the code without changing any functionality by
moving the compound key updating code into a separate function.

Link: http://lkml.kernel.org/r/8976e1ab04b66bc2700ad1ed0768a2de85ac1983.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 63 ++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 418ff27cbbaf..4f4041d76926 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -530,8 +530,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			goto out;
 		}
 
-		if (is_string_field(field)) /* should be last key field */
-			key_size = HIST_KEY_SIZE_MAX - key_offset;
+		if (is_string_field(field))
+			key_size = MAX_FILTER_STR_VAL;
 		else
 			key_size = field->size;
 	}
@@ -830,9 +830,34 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 	}
 }
 
+static inline void add_to_key(char *compound_key, void *key,
+			      struct hist_field *key_field, void *rec)
+{
+	size_t size = key_field->size;
+
+	if (key_field->flags & HIST_FIELD_FL_STRING) {
+		struct ftrace_event_field *field;
+
+		field = key_field->field;
+		if (field->filter_type == FILTER_DYN_STRING)
+			size = *(u32 *)(rec + field->offset) >> 16;
+		else if (field->filter_type == FILTER_PTR_STRING)
+			size = strlen(key);
+		else if (field->filter_type == FILTER_STATIC_STRING)
+			size = field->size;
+
+		/* ensure NULL-termination */
+		if (size > key_field->size - 1)
+			size = key_field->size - 1;
+	}
+
+	memcpy(compound_key + key_field->offset, key, size);
+}
+
 static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
+	bool use_compound_key = (hist_data->n_keys > 1);
 	unsigned long entries[HIST_STACKTRACE_DEPTH];
 	char compound_key[HIST_KEY_SIZE_MAX];
 	struct stack_trace stacktrace;
@@ -842,8 +867,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 	void *key = NULL;
 	unsigned int i;
 
-	if (hist_data->n_keys > 1)
-		memset(compound_key, 0, hist_data->key_size);
+	memset(compound_key, 0, hist_data->key_size);
 
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
@@ -860,35 +884,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 			key = entries;
 		} else {
 			field_contents = key_field->fn(key_field, rec);
-			if (key_field->flags & HIST_FIELD_FL_STRING)
+			if (key_field->flags & HIST_FIELD_FL_STRING) {
 				key = (void *)(unsigned long)field_contents;
-			else
+				use_compound_key = true;
+			} else
 				key = (void *)&field_contents;
-
-			if (hist_data->n_keys > 1) {
-				/* ensure NULL-termination */
-				size_t size = key_field->size - 1;
-
-				if (key_field->flags & HIST_FIELD_FL_STRING) {
-					struct ftrace_event_field *field;
-
-					field = key_field->field;
-					if (field->filter_type == FILTER_DYN_STRING)
-						size = *(u32 *)(rec + field->offset) >> 16;
-					else if (field->filter_type == FILTER_PTR_STRING)
-						size = strlen(key);
-
-					if (size > key_field->size - 1)
-						size = key_field->size - 1;
-				}
-
-				memcpy(compound_key + key_field->offset, key,
-				       size);
-			}
 		}
+
+		if (use_compound_key)
+			add_to_key(compound_key, key, key_field, rec);
 	}
 
-	if (hist_data->n_keys > 1)
+	if (use_compound_key)
 		key = compound_key;
 
 	elt = tracing_map_insert(hist_data->map, key);
-- 
cgit v1.2.3


From d0bad49bb0a094a1beb06640785f95cb256b7272 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:55 -0600
Subject: tracing: Add enable_hist/disable_hist triggers

Similar to enable_event/disable_event triggers, these triggers enable
and disable the aggregation of events into maps rather than enabling
and disabling their writing into the trace buffer.

They can be used to automatically start and stop hist triggers based
on a matching filter condition.

If there's a paused hist trigger on system:event, the following would
start it when the filter condition was hit:

  # echo enable_hist:system:event [ if filter] > event/trigger

And the following would disable a running system:event hist trigger:

  # echo disable_hist:system:event [ if filter] > event/trigger

See Documentation/trace/events.txt for real examples.

Link: http://lkml.kernel.org/r/f812f086e52c8b7c8ad5443487375e03c96a601f.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c                |   8 +++
 kernel/trace/trace.h                |  32 ++++++++++
 kernel/trace/trace_events_hist.c    | 115 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_trigger.c |  71 ++++++++++++----------
 4 files changed, 195 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2238bfde799b..8430145bea12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3807,6 +3807,10 @@ static const char readme_msg[] =
 	"\t   trigger: traceon, traceoff\n"
 	"\t            enable_event:<system>:<event>\n"
 	"\t            disable_event:<system>:<event>\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t            enable_hist:<system>:<event>\n"
+	"\t            disable_hist:<system>:<event>\n"
+#endif
 #ifdef CONFIG_STACKTRACE
 	"\t\t    stacktrace\n"
 #endif
@@ -3867,6 +3871,10 @@ static const char readme_msg[] =
 	"\t    The 'clear' parameter will clear the contents of a running\n"
 	"\t    hist trigger and leave its current paused/active state\n"
 	"\t    unchanged.\n\n"
+	"\t    The enable_hist and disable_hist triggers can be used to\n"
+	"\t    have one event conditionally start and stop another event's\n"
+	"\t    already-attached hist trigger.  The syntax is analagous to\n"
+	"\t    the enable_event and disable_event triggers.\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 505f8a45f426..cab1f4bfe85b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1166,8 +1166,10 @@ extern const struct file_operations event_hist_fops;
 
 #ifdef CONFIG_HIST_TRIGGERS
 extern int register_trigger_hist_cmd(void);
+extern int register_trigger_hist_enable_disable_cmds(void);
 #else
 static inline int register_trigger_hist_cmd(void) { return 0; }
+static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
 #endif
 
 extern int register_trigger_cmds(void);
@@ -1185,6 +1187,34 @@ struct event_trigger_data {
 	struct list_head		list;
 };
 
+/* Avoid typos */
+#define ENABLE_EVENT_STR	"enable_event"
+#define DISABLE_EVENT_STR	"disable_event"
+#define ENABLE_HIST_STR		"enable_hist"
+#define DISABLE_HIST_STR	"disable_hist"
+
+struct enable_trigger_data {
+	struct trace_event_file		*file;
+	bool				enable;
+	bool				hist;
+};
+
+extern int event_enable_trigger_print(struct seq_file *m,
+				      struct event_trigger_ops *ops,
+				      struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_ops *ops,
+				      struct event_trigger_data *data);
+extern int event_enable_trigger_func(struct event_command *cmd_ops,
+				     struct trace_event_file *file,
+				     char *glob, char *cmd, char *param);
+extern int event_enable_register_trigger(char *glob,
+					 struct event_trigger_ops *ops,
+					 struct event_trigger_data *data,
+					 struct trace_event_file *file);
+extern void event_enable_unregister_trigger(char *glob,
+					    struct event_trigger_ops *ops,
+					    struct event_trigger_data *test,
+					    struct trace_event_file *file);
 extern void trigger_data_free(struct event_trigger_data *data);
 extern int event_trigger_init(struct event_trigger_ops *ops,
 			      struct event_trigger_data *data);
@@ -1198,6 +1228,8 @@ extern int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
 			      struct trace_event_file *file);
 extern int register_event_command(struct event_command *cmd);
+extern int unregister_event_command(struct event_command *cmd);
+extern int register_trigger_hist_enable_disable_cmds(void);
 
 /**
  * struct event_trigger_ops - callbacks for trace event triggers
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 4f4041d76926..5d4f02792440 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1393,3 +1393,118 @@ __init int register_trigger_hist_cmd(void)
 
 	return ret;
 }
+
+static void
+hist_enable_trigger(struct event_trigger_data *data, void *rec)
+{
+	struct enable_trigger_data *enable_data = data->private_data;
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (enable_data->enable)
+				test->paused = false;
+			else
+				test->paused = true;
+			break;
+		}
+	}
+}
+
+static void
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+{
+	if (!data->count)
+		return;
+
+	if (data->count != -1)
+		(data->count)--;
+
+	hist_enable_trigger(data, rec);
+}
+
+static struct event_trigger_ops hist_enable_trigger_ops = {
+	.func			= hist_enable_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_enable_count_trigger_ops = {
+	.func			= hist_enable_count_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_trigger_ops = {
+	.func			= hist_enable_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_count_trigger_ops = {
+	.func			= hist_enable_count_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops *
+hist_enable_get_trigger_ops(char *cmd, char *param)
+{
+	struct event_trigger_ops *ops;
+	bool enable;
+
+	enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
+
+	if (enable)
+		ops = param ? &hist_enable_count_trigger_ops :
+			&hist_enable_trigger_ops;
+	else
+		ops = param ? &hist_disable_count_trigger_ops :
+			&hist_disable_trigger_ops;
+
+	return ops;
+}
+
+static struct event_command trigger_hist_enable_cmd = {
+	.name			= ENABLE_HIST_STR,
+	.trigger_type		= ETT_HIST_ENABLE,
+	.func			= event_enable_trigger_func,
+	.reg			= event_enable_register_trigger,
+	.unreg			= event_enable_unregister_trigger,
+	.get_trigger_ops	= hist_enable_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+static struct event_command trigger_hist_disable_cmd = {
+	.name			= DISABLE_HIST_STR,
+	.trigger_type		= ETT_HIST_ENABLE,
+	.func			= event_enable_trigger_func,
+	.reg			= event_enable_register_trigger,
+	.unreg			= event_enable_unregister_trigger,
+	.get_trigger_ops	= hist_enable_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+static __init void unregister_trigger_hist_enable_disable_cmds(void)
+{
+	unregister_event_command(&trigger_hist_enable_cmd);
+	unregister_event_command(&trigger_hist_disable_cmd);
+}
+
+__init int register_trigger_hist_enable_disable_cmds(void)
+{
+	int ret;
+
+	ret = register_event_command(&trigger_hist_enable_cmd);
+	if (WARN_ON(ret < 0))
+		return ret;
+	ret = register_event_command(&trigger_hist_disable_cmd);
+	if (WARN_ON(ret < 0))
+		unregister_trigger_hist_enable_disable_cmds();
+
+	return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d29092afe005..d133f2094566 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd)
  * Currently we only unregister event commands from __init, so mark
  * this __init too.
  */
-static __init int unregister_event_command(struct event_command *cmd)
+__init int unregister_event_command(struct event_command *cmd)
 {
 	struct event_command *p, *n;
 	int ret = -ENODEV;
@@ -1062,15 +1062,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 	unregister_event_command(&trigger_traceoff_cmd);
 }
 
-/* Avoid typos */
-#define ENABLE_EVENT_STR	"enable_event"
-#define DISABLE_EVENT_STR	"disable_event"
-
-struct enable_trigger_data {
-	struct trace_event_file		*file;
-	bool				enable;
-};
-
 static void
 event_enable_trigger(struct event_trigger_data *data, void *rec)
 {
@@ -1100,14 +1091,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
 	event_enable_trigger(data, rec);
 }
 
-static int
-event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-			   struct event_trigger_data *data)
+int event_enable_trigger_print(struct seq_file *m,
+			       struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
 	seq_printf(m, "%s:%s:%s",
-		   enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+		   enable_data->hist ?
+		   (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) :
+		   (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR),
 		   enable_data->file->event_call->class->system,
 		   trace_event_name(enable_data->file->event_call));
 
@@ -1124,9 +1117,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
 	return 0;
 }
 
-static void
-event_enable_trigger_free(struct event_trigger_ops *ops,
-			  struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
@@ -1171,10 +1163,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
 	.free			= event_enable_trigger_free,
 };
 
-static int
-event_enable_trigger_func(struct event_command *cmd_ops,
-			  struct trace_event_file *file,
-			  char *glob, char *cmd, char *param)
+int event_enable_trigger_func(struct event_command *cmd_ops,
+			      struct trace_event_file *file,
+			      char *glob, char *cmd, char *param)
 {
 	struct trace_event_file *event_enable_file;
 	struct enable_trigger_data *enable_data;
@@ -1183,6 +1174,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	struct trace_array *tr = file->tr;
 	const char *system;
 	const char *event;
+	bool hist = false;
 	char *trigger;
 	char *number;
 	bool enable;
@@ -1207,8 +1199,15 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	if (!event_enable_file)
 		goto out;
 
-	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#ifdef CONFIG_HIST_TRIGGERS
+	hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
+		(strcmp(cmd, DISABLE_HIST_STR) == 0));
 
+	enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+		  (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
+	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#endif
 	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
 
 	ret = -ENOMEM;
@@ -1228,6 +1227,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	INIT_LIST_HEAD(&trigger_data->list);
 	RCU_INIT_POINTER(trigger_data->filter, NULL);
 
+	enable_data->hist = hist;
 	enable_data->enable = enable;
 	enable_data->file = event_enable_file;
 	trigger_data->private_data = enable_data;
@@ -1305,10 +1305,10 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	goto out;
 }
 
-static int event_enable_register_trigger(char *glob,
-					 struct event_trigger_ops *ops,
-					 struct event_trigger_data *data,
-					 struct trace_event_file *file)
+int event_enable_register_trigger(char *glob,
+				  struct event_trigger_ops *ops,
+				  struct event_trigger_data *data,
+				  struct trace_event_file *file)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 	struct enable_trigger_data *test_enable_data;
@@ -1318,6 +1318,8 @@ static int event_enable_register_trigger(char *glob,
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		test_enable_data = test->private_data;
 		if (test_enable_data &&
+		    (test->cmd_ops->trigger_type ==
+		     data->cmd_ops->trigger_type) &&
 		    (test_enable_data->file == enable_data->file)) {
 			ret = -EEXIST;
 			goto out;
@@ -1343,10 +1345,10 @@ out:
 	return ret;
 }
 
-static void event_enable_unregister_trigger(char *glob,
-					    struct event_trigger_ops *ops,
-					    struct event_trigger_data *test,
-					    struct trace_event_file *file)
+void event_enable_unregister_trigger(char *glob,
+				     struct event_trigger_ops *ops,
+				     struct event_trigger_data *test,
+				     struct trace_event_file *file)
 {
 	struct enable_trigger_data *test_enable_data = test->private_data;
 	struct enable_trigger_data *enable_data;
@@ -1356,6 +1358,8 @@ static void event_enable_unregister_trigger(char *glob,
 	list_for_each_entry_rcu(data, &file->triggers, list) {
 		enable_data = data->private_data;
 		if (enable_data &&
+		    (data->cmd_ops->trigger_type ==
+		     test->cmd_ops->trigger_type) &&
 		    (enable_data->file == test_enable_data->file)) {
 			unregistered = true;
 			list_del_rcu(&data->list);
@@ -1375,8 +1379,12 @@ event_enable_get_trigger_ops(char *cmd, char *param)
 	struct event_trigger_ops *ops;
 	bool enable;
 
+#ifdef CONFIG_HIST_TRIGGERS
+	enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+		  (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
 	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
-
+#endif
 	if (enable)
 		ops = param ? &event_enable_count_trigger_ops :
 			&event_enable_trigger_ops;
@@ -1447,6 +1455,7 @@ __init int register_trigger_cmds(void)
 	register_trigger_snapshot_cmd();
 	register_trigger_stacktrace_cmd();
 	register_trigger_enable_disable_cmds();
+	register_trigger_hist_enable_disable_cmds();
 	register_trigger_hist_cmd();
 
 	return 0;
-- 
cgit v1.2.3


From 52a7f16dedff8f23d03df3ea556dec95b92a5801 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:57 -0600
Subject: tracing: Add support for multiple hist triggers per event

Allow users to define any number of hist triggers per trace event.
Any number of hist triggers may be added for a given event, which may
differ by key, value, or filter.

Reading the event's 'hist' file will display the output of all the
hist triggers defined on an event concatenated in the order they were
defined.

Link: http://lkml.kernel.org/r/48a0c8dd34c344571de880fb35e211c6d9a28961.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |   8 +-
 kernel/trace/trace_events_hist.c | 172 +++++++++++++++++++++++++++++++--------
 2 files changed, 145 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8430145bea12..e89b2ed76d2d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3856,9 +3856,11 @@ static const char readme_msg[] =
 	"\t    sort field.  The 'size' parameter can be used to specify more\n"
 	"\t    or fewer than the default 2048 entries for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
-	"\t    table in its entirety to stdout.  The default format used to\n"
-	"\t    display a given field can be modified by appending any of the\n"
-	"\t    following modifiers to the field name, as applicable:\n\n"
+	"\t    table in its entirety to stdout.  If there are multiple hist\n"
+	"\t    triggers attached to an event, there will be a table for each\n"
+	"\t    trigger in the output.  The default format used to display a\n"
+	"\t    given field can be modified by appending any of the following\n"
+	"\t    modifiers to the field name, as applicable:\n\n"
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5d4f02792440..4b02f8ab4dd3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1032,33 +1032,18 @@ static int print_entries(struct seq_file *m,
 	return n_entries;
 }
 
-static int hist_show(struct seq_file *m, void *v)
+static void hist_trigger_show(struct seq_file *m,
+			      struct event_trigger_data *data, int n)
 {
-	struct event_trigger_data *test, *data = NULL;
-	struct trace_event_file *event_file;
 	struct hist_trigger_data *hist_data;
 	int n_entries, ret = 0;
 
-	mutex_lock(&event_mutex);
-
-	event_file = event_file_data(m->private);
-	if (unlikely(!event_file)) {
-		ret = -ENODEV;
-		goto out_unlock;
-	}
-
-	list_for_each_entry_rcu(test, &event_file->triggers, list) {
-		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			data = test;
-			break;
-		}
-	}
-	if (!data)
-		goto out_unlock;
+	if (n > 0)
+		seq_puts(m, "\n\n");
 
 	seq_puts(m, "# event histogram\n#\n# trigger info: ");
 	data->ops->print(m, data->ops, data);
-	seq_puts(m, "\n");
+	seq_puts(m, "#\n\n");
 
 	hist_data = data->private_data;
 	n_entries = print_entries(m, hist_data);
@@ -1070,6 +1055,27 @@ static int hist_show(struct seq_file *m, void *v)
 	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
 		   (u64)atomic64_read(&hist_data->map->hits),
 		   n_entries, (u64)atomic64_read(&hist_data->map->drops));
+}
+
+static int hist_show(struct seq_file *m, void *v)
+{
+	struct event_trigger_data *data;
+	struct trace_event_file *event_file;
+	int n = 0, ret = 0;
+
+	mutex_lock(&event_mutex);
+
+	event_file = event_file_data(m->private);
+	if (unlikely(!event_file)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(data, &event_file->triggers, list) {
+		if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
+			hist_trigger_show(m, data, n++);
+	}
+
  out_unlock:
 	mutex_unlock(&event_mutex);
 
@@ -1233,6 +1239,54 @@ static void hist_clear(struct event_trigger_data *data)
 	data->paused = paused;
 }
 
+static bool hist_trigger_match(struct event_trigger_data *data,
+			       struct event_trigger_data *data_test)
+{
+	struct tracing_map_sort_key *sort_key, *sort_key_test;
+	struct hist_trigger_data *hist_data, *hist_data_test;
+	struct hist_field *key_field, *key_field_test;
+	unsigned int i;
+
+	hist_data = data->private_data;
+	hist_data_test = data_test->private_data;
+
+	if (hist_data->n_vals != hist_data_test->n_vals ||
+	    hist_data->n_fields != hist_data_test->n_fields ||
+	    hist_data->n_sort_keys != hist_data_test->n_sort_keys)
+		return false;
+
+	if ((data->filter_str && !data_test->filter_str) ||
+	    (!data->filter_str && data_test->filter_str))
+		return false;
+
+	for_each_hist_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+		key_field_test = hist_data_test->fields[i];
+
+		if (key_field->flags != key_field_test->flags)
+			return false;
+		if (key_field->field != key_field_test->field)
+			return false;
+		if (key_field->offset != key_field_test->offset)
+			return false;
+	}
+
+	for (i = 0; i < hist_data->n_sort_keys; i++) {
+		sort_key = &hist_data->sort_keys[i];
+		sort_key_test = &hist_data_test->sort_keys[i];
+
+		if (sort_key->field_idx != sort_key_test->field_idx ||
+		    sort_key->descending != sort_key_test->descending)
+			return false;
+	}
+
+	if (data->filter_str &&
+	    (strcmp(data->filter_str, data_test->filter_str) != 0))
+		return false;
+
+	return true;
+}
+
 static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
@@ -1243,6 +1297,8 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, test))
+				continue;
 			if (hist_data->attrs->pause)
 				test->paused = true;
 			else if (hist_data->attrs->cont)
@@ -1282,6 +1338,44 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 	return ret;
 }
 
+static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
+				    struct event_trigger_data *data,
+				    struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+	bool unregistered = false;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, test))
+				continue;
+			unregistered = true;
+			list_del_rcu(&test->list);
+			trace_event_trigger_enable_disable(file, 0);
+			update_cond_flag(file);
+			break;
+		}
+	}
+
+	if (unregistered && test->ops->free)
+		test->ops->free(test->ops, test);
+}
+
+static void hist_unreg_all(struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			list_del_rcu(&test->list);
+			trace_event_trigger_enable_disable(file, 0);
+			update_cond_flag(file);
+			if (test->ops->free)
+				test->ops->free(test->ops, test);
+		}
+	}
+}
+
 static int event_hist_trigger_func(struct event_command *cmd_ops,
 				   struct trace_event_file *file,
 				   char *glob, char *cmd, char *param)
@@ -1331,22 +1425,19 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 
 	trigger_data->private_data = hist_data;
 
+	/* if param is non-empty, it's supposed to be a filter */
+	if (param && cmd_ops->set_filter) {
+		ret = cmd_ops->set_filter(param, trigger_data, file);
+		if (ret < 0)
+			goto out_free;
+	}
+
 	if (glob[0] == '!') {
 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
 		ret = 0;
 		goto out_free;
 	}
 
-	if (!param) /* if param is non-empty, it's supposed to be a filter */
-		goto out_reg;
-
-	if (!cmd_ops->set_filter)
-		goto out_reg;
-
-	ret = cmd_ops->set_filter(param, trigger_data, file);
-	if (ret < 0)
-		goto out_free;
- out_reg:
 	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
 	/*
 	 * The above returns on success the # of triggers registered,
@@ -1379,7 +1470,8 @@ static struct event_command trigger_hist_cmd = {
 	.flags			= EVENT_CMD_FL_NEEDS_REC,
 	.func			= event_hist_trigger_func,
 	.reg			= hist_register_trigger,
-	.unreg			= unregister_trigger,
+	.unreg			= hist_unregister_trigger,
+	.unreg_all		= hist_unreg_all,
 	.get_trigger_ops	= event_hist_get_trigger_ops,
 	.set_filter		= set_trigger_filter,
 };
@@ -1406,7 +1498,6 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
 				test->paused = false;
 			else
 				test->paused = true;
-			break;
 		}
 	}
 }
@@ -1469,12 +1560,28 @@ hist_enable_get_trigger_ops(char *cmd, char *param)
 	return ops;
 }
 
+static void hist_enable_unreg_all(struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
+			list_del_rcu(&test->list);
+			update_cond_flag(file);
+			trace_event_trigger_enable_disable(file, 0);
+			if (test->ops->free)
+				test->ops->free(test->ops, test);
+		}
+	}
+}
+
 static struct event_command trigger_hist_enable_cmd = {
 	.name			= ENABLE_HIST_STR,
 	.trigger_type		= ETT_HIST_ENABLE,
 	.func			= event_enable_trigger_func,
 	.reg			= event_enable_register_trigger,
 	.unreg			= event_enable_unregister_trigger,
+	.unreg_all		= hist_enable_unreg_all,
 	.get_trigger_ops	= hist_enable_get_trigger_ops,
 	.set_filter		= set_trigger_filter,
 };
@@ -1485,6 +1592,7 @@ static struct event_command trigger_hist_disable_cmd = {
 	.func			= event_enable_trigger_func,
 	.reg			= event_enable_register_trigger,
 	.unreg			= event_enable_unregister_trigger,
+	.unreg_all		= hist_enable_unreg_all,
 	.get_trigger_ops	= hist_enable_get_trigger_ops,
 	.set_filter		= set_trigger_filter,
 };
-- 
cgit v1.2.3


From db1388b4ffa9e31e9ff0abacc3bdb121bec8c688 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:58 -0600
Subject: tracing: Add support for named triggers

Named triggers are sets of triggers that share a common set of trigger
data.  An example of functionality that could benefit from this type
of capability would be a set of inlined probes that would each
contribute event counts, for example, to a shared counter data
structure.

The first named trigger registered with a given name owns the common
trigger data that the others subsequently registered with the same
name will reference.  The functions defined here allow users to add,
delete, and find named triggers.

It also adds functions to pause and unpause named triggers; since
named triggers act upon common data, they should also be paused and
unpaused as a group.

Link: http://lkml.kernel.org/r/c09ff648360f65b10a3e321eddafe18060b4a04f.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h                |  13 ++++
 kernel/trace/trace_events_trigger.c | 143 ++++++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cab1f4bfe85b..727a3d28bce5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1184,7 +1184,11 @@ struct event_trigger_data {
 	char				*filter_str;
 	void				*private_data;
 	bool				paused;
+	bool				paused_tmp;
 	struct list_head		list;
+	char				*name;
+	struct list_head		named_list;
+	struct event_trigger_data	*named_data;
 };
 
 /* Avoid typos */
@@ -1227,6 +1231,15 @@ extern void unregister_trigger(char *glob, struct event_trigger_ops *ops,
 extern int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
 			      struct trace_event_file *file);
+extern struct event_trigger_data *find_named_trigger(const char *name);
+extern bool is_named_trigger(struct event_trigger_data *test);
+extern int save_named_trigger(const char *name,
+			      struct event_trigger_data *data);
+extern void del_named_trigger(struct event_trigger_data *data);
+extern void pause_named_trigger(struct event_trigger_data *data);
+extern void unpause_named_trigger(struct event_trigger_data *data);
+extern void set_named_trigger_data(struct event_trigger_data *data,
+				   struct event_trigger_data *named_data);
 extern int register_event_command(struct event_command *cmd);
 extern int unregister_event_command(struct event_command *cmd);
 extern int register_trigger_hist_enable_disable_cmds(void);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d133f2094566..a975571cde24 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops,
 	trigger_data->ops = trigger_ops;
 	trigger_data->cmd_ops = cmd_ops;
 	INIT_LIST_HEAD(&trigger_data->list);
+	INIT_LIST_HEAD(&trigger_data->named_list);
 
 	if (glob[0] == '!') {
 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
@@ -764,6 +765,148 @@ int set_trigger_filter(char *filter_str,
 	return ret;
 }
 
+static LIST_HEAD(named_triggers);
+
+/**
+ * find_named_trigger - Find the common named trigger associated with @name
+ * @name: The name of the set of named triggers to find the common data for
+ *
+ * Named triggers are sets of triggers that share a common set of
+ * trigger data.  The first named trigger registered with a given name
+ * owns the common trigger data that the others subsequently
+ * registered with the same name will reference.  This function
+ * returns the common trigger data associated with that first
+ * registered instance.
+ *
+ * Return: the common trigger data for the given named trigger on
+ * success, NULL otherwise.
+ */
+struct event_trigger_data *find_named_trigger(const char *name)
+{
+	struct event_trigger_data *data;
+
+	if (!name)
+		return NULL;
+
+	list_for_each_entry(data, &named_triggers, named_list) {
+		if (data->named_data)
+			continue;
+		if (strcmp(data->name, name) == 0)
+			return data;
+	}
+
+	return NULL;
+}
+
+/**
+ * is_named_trigger - determine if a given trigger is a named trigger
+ * @test: The trigger data to test
+ *
+ * Return: true if 'test' is a named trigger, false otherwise.
+ */
+bool is_named_trigger(struct event_trigger_data *test)
+{
+	struct event_trigger_data *data;
+
+	list_for_each_entry(data, &named_triggers, named_list) {
+		if (test == data)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * save_named_trigger - save the trigger in the named trigger list
+ * @name: The name of the named trigger set
+ * @data: The trigger data to save
+ *
+ * Return: 0 if successful, negative error otherwise.
+ */
+int save_named_trigger(const char *name, struct event_trigger_data *data)
+{
+	data->name = kstrdup(name, GFP_KERNEL);
+	if (!data->name)
+		return -ENOMEM;
+
+	list_add(&data->named_list, &named_triggers);
+
+	return 0;
+}
+
+/**
+ * del_named_trigger - delete a trigger from the named trigger list
+ * @data: The trigger data to delete
+ */
+void del_named_trigger(struct event_trigger_data *data)
+{
+	kfree(data->name);
+	data->name = NULL;
+
+	list_del(&data->named_list);
+}
+
+static void __pause_named_trigger(struct event_trigger_data *data, bool pause)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry(test, &named_triggers, named_list) {
+		if (strcmp(test->name, data->name) == 0) {
+			if (pause) {
+				test->paused_tmp = test->paused;
+				test->paused = true;
+			} else {
+				test->paused = test->paused_tmp;
+			}
+		}
+	}
+}
+
+/**
+ * pause_named_trigger - Pause all named triggers with the same name
+ * @data: The trigger data of a named trigger to pause
+ *
+ * Pauses a named trigger along with all other triggers having the
+ * same name.  Because named triggers share a common set of data,
+ * pausing only one is meaningless, so pausing one named trigger needs
+ * to pause all triggers with the same name.
+ */
+void pause_named_trigger(struct event_trigger_data *data)
+{
+	__pause_named_trigger(data, true);
+}
+
+/**
+ * unpause_named_trigger - Un-pause all named triggers with the same name
+ * @data: The trigger data of a named trigger to unpause
+ *
+ * Un-pauses a named trigger along with all other triggers having the
+ * same name.  Because named triggers share a common set of data,
+ * unpausing only one is meaningless, so unpausing one named trigger
+ * needs to unpause all triggers with the same name.
+ */
+void unpause_named_trigger(struct event_trigger_data *data)
+{
+	__pause_named_trigger(data, false);
+}
+
+/**
+ * set_named_trigger_data - Associate common named trigger data
+ * @data: The trigger data of a named trigger to unpause
+ *
+ * Named triggers are sets of triggers that share a common set of
+ * trigger data.  The first named trigger registered with a given name
+ * owns the common trigger data that the others subsequently
+ * registered with the same name will reference.  This function
+ * associates the common trigger data from the first trigger with the
+ * given trigger.
+ */
+void set_named_trigger_data(struct event_trigger_data *data,
+			    struct event_trigger_data *named_data)
+{
+	data->named_data = named_data;
+}
+
 static void
 traceon_trigger(struct event_trigger_data *data, void *rec)
 {
-- 
cgit v1.2.3


From 5463bfda327b1f7310556ef3136533e27c774f13 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:59 -0600
Subject: tracing: Add support for named hist triggers

Allow users to define 'named' hist triggers.  All triggers created
with the same 'name=xxx' option will update the same shared histogram
data.

This expands the hist trigger syntax from this:

    # echo hist:keys=xxx ... [ if filter] > event/trigger

to this:

    # echo hist:name=xxx:keys=xxx ... [ if filter] > event/trigger

Named histograms must use a 'compatible' set of keys and values, which
means each event added to a set of named triggers must have the same
names and types.

Reading the 'hist' file of any of the participating events will
produce the same output as any other participating event, which is to
be expected since they share the same data.

Link: http://lkml.kernel.org/r/1dbc84ee3322a75daaf5b3ef1d0cc0a2fb682fc7.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  14 ++--
 kernel/trace/trace_events_hist.c | 148 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 141 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e89b2ed76d2d..4e342d354c12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3842,6 +3842,7 @@ static const char readme_msg[] =
 	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [:pause][:continue][:clear]\n"
+	"\t            [:name=histname1]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
@@ -3854,13 +3855,18 @@ static const char readme_msg[] =
 	"\t    specified using the 'sort' keyword.  The sort direction can\n"
 	"\t    be modified by appending '.descending' or '.ascending' to a\n"
 	"\t    sort field.  The 'size' parameter can be used to specify more\n"
-	"\t    or fewer than the default 2048 entries for the hashtable size.\n\n"
+	"\t    or fewer than the default 2048 entries for the hashtable size.\n"
+	"\t    If a hist trigger is given a name using the 'name' parameter,\n"
+	"\t    its histogram data will be shared with other triggers of the\n"
+	"\t    same name, and trigger hits will update this common data.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout.  If there are multiple hist\n"
 	"\t    triggers attached to an event, there will be a table for each\n"
-	"\t    trigger in the output.  The default format used to display a\n"
-	"\t    given field can be modified by appending any of the following\n"
-	"\t    modifiers to the field name, as applicable:\n\n"
+	"\t    trigger in the output.  The table displayed for a named\n"
+	"\t    trigger will be the same as any other instance having the\n"
+	"\t    same name.  The default format used to display a given field\n"
+	"\t    can be modified by appending any of the following modifiers\n"
+	"\t    to the field name, as applicable:\n\n"
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 4b02f8ab4dd3..bdea5603cbde 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -117,6 +117,7 @@ struct hist_trigger_attrs {
 	char		*keys_str;
 	char		*vals_str;
 	char		*sort_key_str;
+	char		*name;
 	bool		pause;
 	bool		cont;
 	bool		clear;
@@ -200,6 +201,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 	if (!attrs)
 		return;
 
+	kfree(attrs->name);
 	kfree(attrs->sort_key_str);
 	kfree(attrs->keys_str);
 	kfree(attrs->vals_str);
@@ -227,6 +229,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 			attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
 			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "name=", strlen("name=")) == 0)
+			attrs->name = kstrdup(str, GFP_KERNEL);
 		else if (strcmp(str, "pause") == 0)
 			attrs->pause = true;
 		else if ((strcmp(str, "cont") == 0) ||
@@ -1131,7 +1135,12 @@ static int event_hist_trigger_print(struct seq_file *m,
 	struct hist_field *key_field;
 	unsigned int i;
 
-	seq_puts(m, "hist:keys=");
+	seq_puts(m, "hist:");
+
+	if (data->name)
+		seq_printf(m, "%s:", data->name);
+
+	seq_puts(m, "keys=");
 
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
@@ -1196,6 +1205,19 @@ static int event_hist_trigger_print(struct seq_file *m,
 	return 0;
 }
 
+static int event_hist_trigger_init(struct event_trigger_ops *ops,
+				   struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+
+	if (!data->ref && hist_data->attrs->name)
+		save_named_trigger(hist_data->attrs->name, data);
+
+	data->ref++;
+
+	return 0;
+}
+
 static void event_hist_trigger_free(struct event_trigger_ops *ops,
 				    struct event_trigger_data *data)
 {
@@ -1206,6 +1228,8 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
 
 	data->ref--;
 	if (!data->ref) {
+		if (data->name)
+			del_named_trigger(data);
 		trigger_data_free(data);
 		destroy_hist_data(hist_data);
 	}
@@ -1214,10 +1238,44 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
 static struct event_trigger_ops event_hist_trigger_ops = {
 	.func			= event_hist_trigger,
 	.print			= event_hist_trigger_print,
-	.init			= event_trigger_init,
+	.init			= event_hist_trigger_init,
 	.free			= event_hist_trigger_free,
 };
 
+static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
+					 struct event_trigger_data *data)
+{
+	data->ref++;
+
+	save_named_trigger(data->named_data->name, data);
+
+	event_hist_trigger_init(ops, data->named_data);
+
+	return 0;
+}
+
+static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
+					  struct event_trigger_data *data)
+{
+	if (WARN_ON_ONCE(data->ref <= 0))
+		return;
+
+	event_hist_trigger_free(ops, data->named_data);
+
+	data->ref--;
+	if (!data->ref) {
+		del_named_trigger(data);
+		trigger_data_free(data);
+	}
+}
+
+static struct event_trigger_ops event_hist_trigger_named_ops = {
+	.func			= event_hist_trigger,
+	.print			= event_hist_trigger_print,
+	.init			= event_hist_trigger_named_init,
+	.free			= event_hist_trigger_named_free,
+};
+
 static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
 							    char *param)
 {
@@ -1227,26 +1285,54 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
 static void hist_clear(struct event_trigger_data *data)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
-	bool paused;
 
-	paused = data->paused;
-	data->paused = true;
+	if (data->name)
+		pause_named_trigger(data);
 
 	synchronize_sched();
 
 	tracing_map_clear(hist_data->map);
 
-	data->paused = paused;
+	if (data->name)
+		unpause_named_trigger(data);
+}
+
+static bool compatible_field(struct ftrace_event_field *field,
+			     struct ftrace_event_field *test_field)
+{
+	if (field == test_field)
+		return true;
+	if (field == NULL || test_field == NULL)
+		return false;
+	if (strcmp(field->name, test_field->name) != 0)
+		return false;
+	if (strcmp(field->type, test_field->type) != 0)
+		return false;
+	if (field->size != test_field->size)
+		return false;
+	if (field->is_signed != test_field->is_signed)
+		return false;
+
+	return true;
 }
 
 static bool hist_trigger_match(struct event_trigger_data *data,
-			       struct event_trigger_data *data_test)
+			       struct event_trigger_data *data_test,
+			       struct event_trigger_data *named_data,
+			       bool ignore_filter)
 {
 	struct tracing_map_sort_key *sort_key, *sort_key_test;
 	struct hist_trigger_data *hist_data, *hist_data_test;
 	struct hist_field *key_field, *key_field_test;
 	unsigned int i;
 
+	if (named_data && (named_data != data_test) &&
+	    (named_data != data_test->named_data))
+		return false;
+
+	if (!named_data && is_named_trigger(data_test))
+		return false;
+
 	hist_data = data->private_data;
 	hist_data_test = data_test->private_data;
 
@@ -1255,9 +1341,11 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 	    hist_data->n_sort_keys != hist_data_test->n_sort_keys)
 		return false;
 
-	if ((data->filter_str && !data_test->filter_str) ||
-	    (!data->filter_str && data_test->filter_str))
-		return false;
+	if (!ignore_filter) {
+		if ((data->filter_str && !data_test->filter_str) ||
+		   (!data->filter_str && data_test->filter_str))
+			return false;
+	}
 
 	for_each_hist_field(i, hist_data) {
 		key_field = hist_data->fields[i];
@@ -1265,7 +1353,7 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 
 		if (key_field->flags != key_field_test->flags)
 			return false;
-		if (key_field->field != key_field_test->field)
+		if (!compatible_field(key_field->field, key_field_test->field))
 			return false;
 		if (key_field->offset != key_field_test->offset)
 			return false;
@@ -1280,7 +1368,7 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 			return false;
 	}
 
-	if (data->filter_str &&
+	if (!ignore_filter && data->filter_str &&
 	    (strcmp(data->filter_str, data_test->filter_str) != 0))
 		return false;
 
@@ -1292,12 +1380,26 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct trace_event_file *file)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
-	struct event_trigger_data *test;
+	struct event_trigger_data *test, *named_data = NULL;
 	int ret = 0;
 
+	if (hist_data->attrs->name) {
+		named_data = find_named_trigger(hist_data->attrs->name);
+		if (named_data) {
+			if (!hist_trigger_match(data, named_data, named_data,
+						true)) {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+	}
+
+	if (hist_data->attrs->name && !named_data)
+		goto new;
+
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			if (!hist_trigger_match(data, test))
+			if (!hist_trigger_match(data, test, named_data, false))
 				continue;
 			if (hist_data->attrs->pause)
 				test->paused = true;
@@ -1310,12 +1412,19 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 			goto out;
 		}
 	}
-
+ new:
 	if (hist_data->attrs->cont || hist_data->attrs->clear) {
 		ret = -ENOENT;
 		goto out;
 	}
 
+	if (named_data) {
+		destroy_hist_data(data->private_data);
+		data->private_data = named_data->private_data;
+		set_named_trigger_data(data, named_data);
+		data->ops = &event_hist_trigger_named_ops;
+	}
+
 	if (hist_data->attrs->pause)
 		data->paused = true;
 
@@ -1329,6 +1438,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 	ret++;
 
 	update_cond_flag(file);
+
 	if (trace_event_trigger_enable_disable(file, 1) < 0) {
 		list_del_rcu(&data->list);
 		update_cond_flag(file);
@@ -1342,12 +1452,16 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 				    struct event_trigger_data *data,
 				    struct trace_event_file *file)
 {
-	struct event_trigger_data *test;
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct event_trigger_data *test, *named_data = NULL;
 	bool unregistered = false;
 
+	if (hist_data->attrs->name)
+		named_data = find_named_trigger(hist_data->attrs->name);
+
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			if (!hist_trigger_match(data, test))
+			if (!hist_trigger_match(data, test, named_data, false))
 				continue;
 			unregistered = true;
 			list_del_rcu(&test->list);
-- 
cgit v1.2.3


From 4b94f5b7b4a5ffd885609bd033af2ecca0c9cc54 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 3 Mar 2016 12:55:02 -0600
Subject: tracing: Add hist trigger 'log2' modifier

Allow users to have numeric fields displayed as log2 values in case
value range is very wide by appending '.log2' to field names.

For example,

  # echo 'hist:key=bytes_req' > kmalloc/trigger
  # cat kmalloc/hist

  { bytes_req:        504 } hitcount:          1
  { bytes_req:         11 } hitcount:          1
  { bytes_req:        104 } hitcount:          1
  { bytes_req:         48 } hitcount:          1
  { bytes_req:       2048 } hitcount:          1
  { bytes_req:       4096 } hitcount:          1
  { bytes_req:        240 } hitcount:          1
  { bytes_req:        392 } hitcount:          1
  { bytes_req:         13 } hitcount:          1
  { bytes_req:         28 } hitcount:          1
  { bytes_req:         12 } hitcount:          1
  { bytes_req:         64 } hitcount:          2
  { bytes_req:        128 } hitcount:          2
  { bytes_req:         32 } hitcount:          2
  { bytes_req:          8 } hitcount:         11
  { bytes_req:         10 } hitcount:         13
  { bytes_req:         24 } hitcount:         25
  { bytes_req:        160 } hitcount:         29
  { bytes_req:         16 } hitcount:         33
  { bytes_req:         80 } hitcount:         36

When using '.log2' modifier, the output looks like:

  # echo 'hist:key=bytes_req.log2' > kmalloc/trigger
  # cat kmalloc/hist

  { bytes_req: ~ 2^12 } hitcount:          1
  { bytes_req: ~ 2^11 } hitcount:          1
  { bytes_req: ~ 2^9  } hitcount:          2
  { bytes_req: ~ 2^6  } hitcount:          3
  { bytes_req: ~ 2^3  } hitcount:         13
  { bytes_req: ~ 2^5  } hitcount:         19
  { bytes_req: ~ 2^8  } hitcount:         49
  { bytes_req: ~ 2^7  } hitcount:         57
  { bytes_req: ~ 2^4  } hitcount:         74

Link: http://lkml.kernel.org/r/7ff396b246c6a881f46b979735fddf05a0d6c71a.1457029949.git.tom.zanussi@linux.intel.com

Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  1 +
 kernel/trace/trace_events_hist.c | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4e342d354c12..988a35263fdd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3872,6 +3872,7 @@ static const char readme_msg[] =
 	"\t            .sym-offset display an address as a symbol and offset\n"
 	"\t            .execname   display a common_pid as a program name\n"
 	"\t            .syscall    display a syscall id as a syscall name\n\n"
+	"\t            .log2       display log2 value rather than raw number\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index bdea5603cbde..23df38aab503 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -68,6 +68,13 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
 	return (u64)(unsigned long)*addr;
 }
 
+static u64 hist_field_log2(struct hist_field *hist_field, void *event)
+{
+	u64 val = *(u64 *)(event + hist_field->field->offset);
+
+	return (u64) ilog2(roundup_pow_of_two(val));
+}
+
 #define DEFINE_HIST_FIELD_FN(type)					\
 static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
 {									\
@@ -111,6 +118,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_EXECNAME		= 64,
 	HIST_FIELD_FL_SYSCALL		= 128,
 	HIST_FIELD_FL_STACKTRACE	= 256,
+	HIST_FIELD_FL_LOG2		= 512,
 };
 
 struct hist_trigger_attrs {
@@ -358,6 +366,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		goto out;
 	}
 
+	if (flags & HIST_FIELD_FL_LOG2) {
+		hist_field->fn = hist_field_log2;
+		goto out;
+	}
+
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 
@@ -522,6 +535,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 				flags |= HIST_FIELD_FL_EXECNAME;
 			else if (strcmp(field_str, "syscall") == 0)
 				flags |= HIST_FIELD_FL_SYSCALL;
+			else if (strcmp(field_str, "log2") == 0)
+				flags |= HIST_FIELD_FL_LOG2;
 			else {
 				ret = -EINVAL;
 				goto out;
@@ -980,6 +995,9 @@ hist_trigger_entry_print(struct seq_file *m,
 						      key + key_field->offset,
 						      HIST_STACKTRACE_DEPTH);
 			multiline = true;
+		} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
+			seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+				   *(u64 *)(key + key_field->offset));
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -1112,6 +1130,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "execname";
 	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
 		flags_str = "syscall";
+	else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+		flags_str = "log2";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From d50c744ecde7ee3ba4d7ffb0e1c55e7a2f6bbc8e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 8 Mar 2016 17:17:15 -0500
Subject: tracing: Fix unsigned comparison to zero in hist trigger code

Fengguang Wu's bot found two comparisons of unsigned integers to zero. These
were real bugs, as it would miss error conditions returned to zero.

trace_events_hist.c:426:6-9: WARNING: Unsigned expression compared with zero: idx < 0
trace_events_hist.c:568:5-14: WARNING: Unsigned expression compared with zero: n_entries < 0

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 23df38aab503..f98b6b3a2804 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -734,7 +734,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 	struct tracing_map *map = hist_data->map;
 	struct ftrace_event_field *field;
 	struct hist_field *hist_field;
-	unsigned int i, idx;
+	int i, idx;
 
 	for_each_hist_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
@@ -1036,7 +1036,7 @@ static int print_entries(struct seq_file *m,
 {
 	struct tracing_map_sort_entry **sort_entries = NULL;
 	struct tracing_map *map = hist_data->map;
-	unsigned int i, n_entries;
+	int i, n_entries;
 
 	n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
 					     hist_data->n_sort_keys,
-- 
cgit v1.2.3


From 205506228b69be4efbb3809957a69420f26a8d9f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 25 Apr 2016 22:40:12 -0400
Subject: tracing: Do not inherit event-fork option for instances

As the event-fork option requires doing work when enabled and disabled, it
can not be passed down to created instances. The instance must clear this
flag when it is created, and must clear it when its removed.

As more options may be created with this need, a macro ZEROED_TRACE_FLAGS is
created that holds the flags that must not be inherited by the top level
instance, and must be cleared on removal of instances.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 988a35263fdd..5e3ad3481e4b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec)
 #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |			\
 	       TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
 
+/* trace_flags that are default zero for instances */
+#define ZEROED_TRACE_FLAGS \
+	TRACE_ITER_EVENT_FORK
 
 /*
  * The global_trace is the descriptor that holds the tracing
@@ -6710,7 +6713,7 @@ static int instance_mkdir(const char *name)
 	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
 		goto out_free_tr;
 
-	tr->trace_flags = global_trace.trace_flags;
+	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
 	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
 
@@ -6784,6 +6787,12 @@ static int instance_rmdir(const char *name)
 
 	list_del(&tr->list);
 
+	/* Disable all the flags that were enabled coming in */
+	for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
+		if ((1 << i) & ZEROED_TRACE_FLAGS)
+			set_tracer_flag(tr, 1 << i, 0);
+	}
+
 	tracing_set_nop(tr);
 	event_trace_del_tracer(tr);
 	ftrace_destroy_function_files(tr);
-- 
cgit v1.2.3


From c8585c6fcaf2011de54c3592e80a634a2b9e1a7f Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Mon, 25 Apr 2016 23:22:35 -0400
Subject: ext4: fix races between changing inode journal mode and
 ext4_writepages

In ext4, there is a race condition between changing inode journal mode
and ext4_writepages(). While ext4_writepages() is executed on a
non-journalled mode inode, the inode's journal mode could be enabled
by ioctl() and then, some pages dirtied after switching the journal
mode will be still exposed to ext4_writepages() in non-journaled mode.
To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan
Kara's suggestion because we don't want to waste ext4_inode_info's
space for this extra rare case.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 kernel/locking/percpu-rwsem.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..bec0b647f9cc 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
 	free_percpu(brw->fast_read_ctr);
 	brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely
-- 
cgit v1.2.3


From 4812952f9c94f67b3cc78ad0a6482bf182aa5a44 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 23 Apr 2016 13:23:47 +0300
Subject: tracing: checking for NULL instead of IS_ERR()

tracing_map_elt_alloc() returns ERR_PTRs on error, never NULL.

Fixes: 08d43a5fa063 ('tracing: Add lock-free tracing_map')
Link: http://lkml.kernel.org/r/20160423102347.GA11136@mwanda

Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index e0f172932eca..e7dfc5eec669 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -814,7 +814,7 @@ static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
 	unsigned int i;
 
 	dup_elt = tracing_map_elt_alloc(elt->map);
-	if (!dup_elt)
+	if (IS_ERR(dup_elt))
 		return NULL;
 
 	if (elt->map->ops && elt->map->ops->elt_copy)
-- 
cgit v1.2.3


From 432480c58219eff32904b879eb3fcc1d268a3b06 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 25 Apr 2016 14:01:27 -0500
Subject: tracing: Add check for NULL event field when creating hist field

Smatch flagged create_hist_field() as possibly being able to
dereference a NULL pointer, although the current code exits in all
cases where the event field could be NULL, so it's not actually a
problem.

Still, to prevent future changes to the code from overlooking new
cases, make the NULL pointer check explicit and warn once in that
case.

Link: http://lkml.kernel.org/r/cfbc003f534a3e441b4313272fd412310aba6336.1461610073.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f98b6b3a2804..0c05b8a99806 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -371,6 +371,9 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		goto out;
 	}
 
+	if (WARN_ON_ONCE(!field))
+		goto out;
+
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 
-- 
cgit v1.2.3


From 6e4cf657defa8fb06dbf9ed91adb78414758f259 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 25 Apr 2016 14:01:28 -0500
Subject: tracing: Handle tracing_map_alloc_elts() error path correctly

If tracing_map_elt_alloc() fails, it will return ERR_PTR() instead of
NULL, so change the check to IS_ERROR().  We also need to set the
failed entry in the map->elts array to NULL instead of ERR_PTR() so
tracing_map_free_elts() doesn't try freeing an ERR_PTR().

tracing_map_free_elts() should also zero out what it frees so a
reentrant call won't find previously freed elements.

Link: http://lkml.kernel.org/r/f29d03b00bce3aac8cf151a8a30e6c83e5fee66d.1461610073.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index e7dfc5eec669..0a689bbb78ef 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -366,10 +366,13 @@ static void tracing_map_free_elts(struct tracing_map *map)
 	if (!map->elts)
 		return;
 
-	for (i = 0; i < map->max_elts; i++)
+	for (i = 0; i < map->max_elts; i++) {
 		tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i)));
+		*(TRACING_MAP_ELT(map->elts, i)) = NULL;
+	}
 
 	tracing_map_array_free(map->elts);
+	map->elts = NULL;
 }
 
 static int tracing_map_alloc_elts(struct tracing_map *map)
@@ -383,7 +386,8 @@ static int tracing_map_alloc_elts(struct tracing_map *map)
 
 	for (i = 0; i < map->max_elts; i++) {
 		*(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map);
-		if (!(*(TRACING_MAP_ELT(map->elts, i)))) {
+		if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) {
+			*(TRACING_MAP_ELT(map->elts, i)) = NULL;
 			tracing_map_free_elts(map);
 
 			return -ENOMEM;
-- 
cgit v1.2.3


From 4afe6495e5cb3c352d95f07512cbb227e607e2ce Mon Sep 17 00:00:00 2001
From: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Date: Mon, 18 Apr 2016 15:23:29 +0800
Subject: tracing: Don't use the address of the buffer array name in
 copy_from_user

With the following code snippet:

    ...
    char buf[64];
    ...
    if (copy_from_user(&buf, ubuf, cnt))
    ...

Even though the value of "&buf" equals "buf", but there is no need
to get the address of the "buf" again. Use "buf" instead of "&buf".

Link: http://lkml.kernel.org/r/20160418152329.18b72bea@debian

Signed-off-by: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5e3ad3481e4b..46028d47d252 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3664,7 +3664,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
+	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 
 	buf[cnt] = 0;
@@ -4537,7 +4537,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 	if (cnt > MAX_TRACER_SIZE)
 		cnt = MAX_TRACER_SIZE;
 
-	if (copy_from_user(&buf, ubuf, cnt))
+	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 
 	buf[cnt] = 0;
@@ -5327,7 +5327,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
+	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 
 	buf[cnt] = 0;
-- 
cgit v1.2.3


From db0a6fb5d97afe01fd9c47d37c6daa82d4d4001d Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 21 Apr 2016 14:14:01 -0400
Subject: audit: add tty field to LOGIN event

The tty field was missing from AUDIT_LOGIN events.

Refactor code to create a new function audit_get_tty(), using it to
replace the call in audit_log_task_info() and to add it to
audit_log_set_loginuid().  Lock and bump the kref to protect it, adding
audit_put_tty() alias to decrement it.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c   | 18 +++++-------------
 kernel/auditsc.c |  8 ++++++--
 2 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index f52fbefede09..384374a1d232 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -64,7 +64,6 @@
 #include <linux/security.h>
 #endif
 #include <linux/freezer.h>
-#include <linux/tty.h>
 #include <linux/pid_namespace.h>
 #include <net/netns/generic.h>
 
@@ -1871,21 +1870,14 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
 	char comm[sizeof(tsk->comm)];
-	char *tty;
+	struct tty_struct *tty;
 
 	if (!ab)
 		return;
 
 	/* tsk == current */
 	cred = current_cred();
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
-		tty = tsk->signal->tty->name;
-	else
-		tty = "(none)";
-	spin_unlock_irq(&tsk->sighand->siglock);
-
+	tty = audit_get_tty(tsk);
 	audit_log_format(ab,
 			 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
 			 " euid=%u suid=%u fsuid=%u"
@@ -1901,11 +1893,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 			 from_kgid(&init_user_ns, cred->egid),
 			 from_kgid(&init_user_ns, cred->sgid),
 			 from_kgid(&init_user_ns, cred->fsgid),
-			 tty, audit_get_sessionid(tsk));
-
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(tsk));
+	audit_put_tty(tty);
 	audit_log_format(ab, " comm=");
 	audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
-
 	audit_log_d_path_exe(ab, tsk->mm);
 	audit_log_task_context(ab);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 195ffaee50b9..71e14d836e69 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1980,6 +1980,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 {
 	struct audit_buffer *ab;
 	uid_t uid, oldloginuid, loginuid;
+	struct tty_struct *tty;
 
 	if (!audit_enabled)
 		return;
@@ -1987,14 +1988,17 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
+	tty = audit_get_tty(current);
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
 	if (!ab)
 		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
-	audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
-			 oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+			 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+			 oldsessionid, sessionid, !rc);
+	audit_put_tty(tty);
 	audit_log_end(ab);
 }
 
-- 
cgit v1.2.3


From 7132e2d669bd42c3783327f301aaac5f4463299b Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Date: Mon, 25 Apr 2016 18:56:14 -0300
Subject: ftrace: Match dot symbols when searching functions on ppc64

In the ppc64 big endian ABI, function symbols point to function
descriptors. The symbols which point to the function entry points
have a dot in front of the function name. Consequently, when the
ftrace filter mechanism searches for the symbol corresponding to
an entry point address, it gets the dot symbol.

As a result, ftrace filter users have to be aware of this ABI detail on
ppc64 and prepend a dot to the function name when setting the filter.

The perf probe command insulates the user from this by ignoring the dot
in front of the symbol name when matching function names to symbols,
but the sysfs interface does not. This patch makes the ftrace filter
mechanism do the same when searching symbols.

Fixes the following failure in ftracetest's kprobe_ftrace.tc:

  .../kprobe_ftrace.tc: line 9: echo: write error: Invalid argument

That failure is on this line of kprobe_ftrace.tc:

  echo _do_fork > set_ftrace_filter

This is because there's no _do_fork entry in the functions list:

  # cat available_filter_functions | grep _do_fork
  ._do_fork

This change introduces no regressions on the perf and ftracetest
testsuite results.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 kernel/trace/ftrace.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e8d792da963..a6c8252d7776 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3456,11 +3456,23 @@ struct ftrace_glob {
 	int type;
 };
 
+/*
+ * If symbols in an architecture don't correspond exactly to the user-visible
+ * name of what they represent, it is possible to define this function to
+ * perform the necessary adjustments.
+*/
+char * __weak arch_ftrace_match_adjust(char *str, const char *search)
+{
+	return str;
+}
+
 static int ftrace_match(char *str, struct ftrace_glob *g)
 {
 	int matched = 0;
 	int slen;
 
+	str = arch_ftrace_match_adjust(str, g->search);
+
 	switch (g->type) {
 	case MATCH_FULL:
 		if (strcmp(str, g->search) == 0)
-- 
cgit v1.2.3


From dad56ee742a3abbb5d9e8108f8537d412bff3f57 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 26 Apr 2016 21:22:22 -0400
Subject: tracing: Move event_trigger_unlock_commit{_regs}() to local header

The functions event_trigger_unlock_commit() and
event_trigger_unlock_commit_regs() are no longer used outside the tracing
system. Move them out of the generic headers and into the local one.

Along with __event_trigger_test_discard() that is only used by them.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 727a3d28bce5..c0eac7b1e5a6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,6 +1065,100 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
+/*
+ * Helper function for event_trigger_unlock_commit{_regs}().
+ * If there are event triggers attached to this event that requires
+ * filtering against its fields, then they wil be called as the
+ * entry already holds the field information of the current event.
+ *
+ * It also checks if the event should be discarded or not.
+ * It is to be discarded if the event is soft disabled and the
+ * event was only recorded to process triggers, or if the event
+ * filter is active and this event did not match the filters.
+ *
+ * Returns true if the event is discarded, false otherwise.
+ */
+static inline bool
+__event_trigger_test_discard(struct trace_event_file *file,
+			     struct ring_buffer *buffer,
+			     struct ring_buffer_event *event,
+			     void *entry,
+			     enum event_trigger_type *tt)
+{
+	unsigned long eflags = file->flags;
+
+	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
+		*tt = event_triggers_call(file, entry);
+
+	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
+		ring_buffer_discard_commit(buffer, event);
+	else if (!filter_check_discard(file, entry, buffer, event))
+		return false;
+
+	return true;
+}
+
+/**
+ * event_trigger_unlock_commit - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ */
+static inline void
+event_trigger_unlock_commit(struct trace_event_file *file,
+			    struct ring_buffer *buffer,
+			    struct ring_buffer_event *event,
+			    void *entry, unsigned long irq_flags, int pc)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+
+	if (tt)
+		event_triggers_post_call(file, tt, entry);
+}
+
+/**
+ * event_trigger_unlock_commit_regs - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ *
+ * Same as event_trigger_unlock_commit() but calls
+ * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */
+static inline void
+event_trigger_unlock_commit_regs(struct trace_event_file *file,
+				 struct ring_buffer *buffer,
+				 struct ring_buffer_event *event,
+				 void *entry, unsigned long irq_flags, int pc,
+				 struct pt_regs *regs)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
+						irq_flags, pc, regs);
+
+	if (tt)
+		event_triggers_post_call(file, tt, entry);
+}
+
 #define FILTER_PRED_INVALID	((unsigned short)-1)
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
 #define FILTER_PRED_FOLD	(1 << 15)
-- 
cgit v1.2.3


From 65da9a0a3bf2202c2432f42d41eb908f2fa30579 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2016 10:13:46 -0400
Subject: tracing: Make filter_check_discard() local

Nothing outside of the tracing directory calls filter_check_discard() or
check_filter_check_discard(). They should not be called by modules. Move
their prototypes into the local tracing header and remove their
EXPORT_SYMBOL() macros.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 --
 kernel/trace/trace.h | 6 ++++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 46028d47d252..02f5a5f51d49 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -318,7 +318,6 @@ int filter_check_discard(struct trace_event_file *file, void *rec,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(filter_check_discard);
 
 int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
@@ -332,7 +331,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
 
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c0eac7b1e5a6..ee8691c66bfe 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,6 +1065,12 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
+extern int filter_check_discard(struct trace_event_file *file, void *rec,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event);
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From 9cbb1506ab2db987c160e7fc50665bf47b5b6fa1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2016 11:09:42 -0400
Subject: tracing: Fold filter_check_discard() into its only user

The function filter_check_discard() is small and only called by one user,
its code can be folded into that one caller and make the code a bit less
comlplex.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 13 -------------
 kernel/trace/trace.h | 13 ++++++-------
 2 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02f5a5f51d49..1ba54e241c8d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -306,19 +306,6 @@ void trace_array_put(struct trace_array *this_tr)
 	mutex_unlock(&trace_types_lock);
 }
 
-int filter_check_discard(struct trace_event_file *file, void *rec,
-			 struct ring_buffer *buffer,
-			 struct ring_buffer_event *event)
-{
-	if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
-	    !filter_match_preds(file->filter, rec)) {
-		ring_buffer_discard_commit(buffer, event);
-		return 1;
-	}
-
-	return 0;
-}
-
 int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
 			      struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee8691c66bfe..0862e7559548 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,9 +1065,6 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
-extern int filter_check_discard(struct trace_event_file *file, void *rec,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event);
 extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
@@ -1096,12 +1093,14 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
 		*tt = event_triggers_call(file, entry);
 
-	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
+	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
+	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
+	     !filter_match_preds(file->filter, entry))) {
 		ring_buffer_discard_commit(buffer, event);
-	else if (!filter_check_discard(file, entry, buffer, event))
-		return false;
+		return true;
+	}
 
-	return true;
+	return false;
 }
 
 /**
-- 
cgit v1.2.3


From fa66ddb870ca022342fe6d1312ef76d2f7233a1d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 Apr 2016 12:04:13 -0400
Subject: tracing: Move trace_buffer_unlock_commit{_regs}() to local header

The functions trace_buffer_unlock_commit() and the _regs() version are only
used within the kernel/trace directory. Move them to the local header and
remove the export as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c |  2 --
 kernel/trace/trace.h | 10 ++++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ba54e241c8d..94e7e4d11b79 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1696,7 +1696,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 	ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
 
 static struct ring_buffer *temp_buffer;
 
@@ -1748,7 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
 
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0862e7559548..bd5ae56dec7a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1068,6 +1068,16 @@ struct trace_subsystem_dir {
 extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc);
+void trace_buffer_unlock_commit_regs(struct trace_array *tr,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event,
+				     unsigned long flags, int pc,
+				     struct pt_regs *regs);
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From a9fe48dcde88fd48e210e4280f19cb9300ec9112 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 16:12:39 -0400
Subject: tracing: Remove unused function trace_current_buffer_discard_commit()

The function trace_current_buffer_discard_commit() has no callers, remove
it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 94e7e4d11b79..e5bdb9accf52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1734,7 +1734,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
 	return trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
@@ -1748,13 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	ftrace_trace_userstack(buffer, flags, pc);
 }
 
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-					 struct ring_buffer_event *event)
-{
-	ring_buffer_discard_commit(buffer, event);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
-
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
-- 
cgit v1.2.3


From 33fddff24d05d71f97722cb7deec4964d39d10dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 17:44:01 -0400
Subject: tracing: Have trace_buffer_unlock_commit() call the _regs version
 with NULL

There's no real difference between trace_buffer_unlock_commit() and
trace_buffer_unlock_commit_regs() except that the former passes NULL to
ftrace_stack_trace() instead of regs. Have the former be a static inline of
the latter which passes NULL for regs.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 -----------
 kernel/trace/trace.h | 13 +++++++++----
 2 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5bdb9accf52..41bf14412666 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1686,17 +1686,6 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve
 	ring_buffer_unlock_commit(buffer, event);
 }
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc)
-{
-	__buffer_unlock_commit(buffer, event);
-
-	ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
-	ftrace_trace_userstack(buffer, flags, pc);
-}
-
 static struct ring_buffer *temp_buffer;
 
 struct ring_buffer_event *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd5ae56dec7a..10156a09103f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1069,15 +1069,20 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc);
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event,
 				     unsigned long flags, int pc,
 				     struct pt_regs *regs);
+
+static inline void trace_buffer_unlock_commit(struct trace_array *tr,
+					      struct ring_buffer *buffer,
+					      struct ring_buffer_event *event,
+					      unsigned long flags, int pc)
+{
+	trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+}
+
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From 9b9db275051cd9191e7776c4fd79ccd4318aa2dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 18:10:21 -0400
Subject: tracing: Remove one use of trace_current_buffer_lock_reserve()

The only user of trace_current_buffer_lock_reserve() is in the boot up self
tests. Restructure the code a little to have that code use what everything
else uses: trace_event_buffer_lock_reserve().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7cb983ee93c..da1eeb6190e3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3392,7 +3392,7 @@ static __init void event_trace_self_tests(void)
 
 static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
 
-static struct trace_array *event_tr;
+static struct trace_event_file event_trace_file __initdata;
 
 static void __init
 function_test_events_call(unsigned long ip, unsigned long parent_ip,
@@ -3416,17 +3416,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 
 	local_save_flags(flags);
 
-	event = trace_current_buffer_lock_reserve(&buffer,
-						  TRACE_FN, sizeof(*entry),
-						  flags, pc);
+	event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
+						TRACE_FN, sizeof(*entry),
+						flags, pc);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc);
-
+	event_trigger_unlock_commit(&event_trace_file, buffer, event,
+				    entry, flags, pc);
  out:
 	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
 	preempt_enable_notrace();
@@ -3441,9 +3441,11 @@ static struct ftrace_ops trace_ops __initdata  =
 static __init void event_trace_self_test_with_function(void)
 {
 	int ret;
-	event_tr = top_trace_array();
-	if (WARN_ON(!event_tr))
+
+	event_trace_file.tr = top_trace_array();
+	if (WARN_ON(!event_trace_file.tr))
 		return;
+
 	ret = register_ftrace_function(&trace_ops);
 	if (WARN_ON(ret < 0)) {
 		pr_info("Failed to enable function tracer for event tests\n");
-- 
cgit v1.2.3


From 904d1857ad09b43f514897dd42daffe200d1ca50 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 18:11:54 -0400
Subject: tracing: Remove unused function trace_current_buffer_lock_reserve()

trace_current_buffer_lock_reserve() has no more users. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 41bf14412666..c09e8ffadc73 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1714,16 +1714,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
-				  int type, unsigned long len,
-				  unsigned long flags, int pc)
-{
-	*current_rb = global_trace.trace_buffer.buffer;
-	return trace_buffer_lock_reserve(*current_rb,
-					 type, len, flags, pc);
-}
-
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event,
-- 
cgit v1.2.3


From dcb0b5575d24a32f51a3f1003312fb94ed4e214a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 2 May 2016 21:30:04 -0400
Subject: tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic

Nothing sets TRACE_EVENT_FL_USE_CALL_FILTER anymore. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 71 ++++++--------------------------------
 1 file changed, 10 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b3f5051cd4e9..d1d27bf37a19 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps,
 
 static inline struct event_filter *event_filter(struct trace_event_file *file)
 {
-	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		return file->event_call->filter;
-	else
-		return file->filter;
+	return file->filter;
 }
 
 /* caller must hold event_mutex */
@@ -826,12 +823,7 @@ static void __free_preds(struct event_filter *filter)
 
 static void filter_disable(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	else
-		file->flags &= ~EVENT_FILE_FL_FILTERED;
+	file->flags &= ~EVENT_FILE_FL_FILTERED;
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -883,13 +875,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 
 static inline void __remove_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
 	filter_disable(file);
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		remove_filter_string(call->filter);
-	else
-		remove_filter_string(file->filter);
+	remove_filter_string(file->filter);
 }
 
 static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
@@ -906,15 +893,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
 
 static inline void __free_subsystem_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
-		__free_filter(call->filter);
-		call->filter = NULL;
-	} else {
-		__free_filter(file->filter);
-		file->filter = NULL;
-	}
+	__free_filter(file->filter);
+	file->filter = NULL;
 }
 
 static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
@@ -1718,69 +1698,38 @@ fail:
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags |= TRACE_EVENT_FL_FILTERED;
-	else
-		file->flags |= EVENT_FILE_FL_FILTERED;
+	file->flags |= EVENT_FILE_FL_FILTERED;
 }
 
 static inline void event_set_filter(struct trace_event_file *file,
 				    struct event_filter *filter)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		rcu_assign_pointer(call->filter, filter);
-	else
-		rcu_assign_pointer(file->filter, filter);
+	rcu_assign_pointer(file->filter, filter);
 }
 
 static inline void event_clear_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		RCU_INIT_POINTER(call->filter, NULL);
-	else
-		RCU_INIT_POINTER(file->filter, NULL);
+	RCU_INIT_POINTER(file->filter, NULL);
 }
 
 static inline void
 event_set_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
-	else
-		file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
+	file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline void
 event_clear_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
-	else
-		file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
+	file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline bool
 event_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
 	if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
 		return true;
 
-	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
-	    (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
-		return true;
-
 	return false;
 }
 
-- 
cgit v1.2.3


From 0fc1b09ff1ff404ddf753f5ffa5cd0adc8fdcdc9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 3 May 2016 17:15:43 -0400
Subject: tracing: Use temp buffer when filtering events

Filtering of events requires the data to be written to the ring buffer
before it can be decided to filter or not. This is because the parameters of
the filter are based on the result that is written to the ring buffer and
not on the parameters that are passed into the trace functions.

The ftrace ring buffer is optimized for writing into the ring buffer and
committing. The discard procedure used when filtering decides the event
should be discarded is much more heavy weight. Thus, using a temporary
filter when filtering events can speed things up drastically.

Without a temp buffer we have:

 # trace-cmd start -p nop
 # perf stat -r 10 hackbench 50
       0.790706626 seconds time elapsed ( +-  0.71% )

 # trace-cmd start -e all
 # perf stat -r 10 hackbench 50
       1.566904059 seconds time elapsed ( +-  0.27% )

 # trace-cmd start -e all -f 'common_preempt_count==20'
 # perf stat -r 10 hackbench 50
       1.690598511 seconds time elapsed ( +-  0.19% )

 # trace-cmd start -e all -f 'common_preempt_count!=20'
 # perf stat -r 10 hackbench 50
       1.707486364 seconds time elapsed ( +-  0.30% )

The first run above is without any tracing, just to get a based figure.
hackbench takes ~0.79 seconds to run on the system.

The second run enables tracing all events where nothing is filtered. This
increases the time by 100% and hackbench takes 1.57 seconds to run.

The third run filters all events where the preempt count will equal "20"
(this should never happen) thus all events are discarded. This takes 1.69
seconds to run. This is 10% slower than just committing the events!

The last run enables all events and filters where the filter will commit all
events, and this takes 1.70 seconds to run. The filtering overhead is
approximately 10%. Thus, the discard and commit of an event from the ring
buffer may be about the same time.

With this patch, the numbers change:

 # trace-cmd start -p nop
 # perf stat -r 10 hackbench 50
       0.778233033 seconds time elapsed ( +-  0.38% )

 # trace-cmd start -e all
 # perf stat -r 10 hackbench 50
       1.582102692 seconds time elapsed ( +-  0.28% )

 # trace-cmd start -e all -f 'common_preempt_count==20'
 # perf stat -r 10 hackbench 50
       1.309230710 seconds time elapsed ( +-  0.22% )

 # trace-cmd start -e all -f 'common_preempt_count!=20'
 # perf stat -r 10 hackbench 50
       1.786001924 seconds time elapsed ( +-  0.20% )

The first run is again the base with no tracing.

The second run is all tracing with no filtering. It is a little slower, but
that may be well within the noise.

The third run shows that discarding all events only took 1.3 seconds. This
is a speed up of 23%! The discard is much faster than even the commit.

The one downside is shown in the last run. Events that are not discarded by
the filter will take longer to add, this is due to the extra copy of the
event.

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c               | 154 +++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.h               |  19 ++++-
 kernel/trace/trace_events.c        |  10 +++
 kernel/trace/trace_events_filter.c |  10 +++
 4 files changed, 185 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c09e8ffadc73..8a4bd6b68a0b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -312,7 +312,7 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 {
 	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
 	    !filter_match_preds(call->filter, rec)) {
-		ring_buffer_discard_commit(buffer, event);
+		__trace_event_discard_commit(buffer, event);
 		return 1;
 	}
 
@@ -1660,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+		  int type, unsigned long flags, int pc)
+{
+	struct trace_entry *ent = ring_buffer_event_data(event);
+
+	tracing_generic_entry_update(ent, flags, pc);
+	ent->type = type;
+}
+
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct ring_buffer *buffer,
 			  int type,
@@ -1669,21 +1679,136 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(buffer, len);
-	if (event != NULL) {
-		struct trace_entry *ent = ring_buffer_event_data(event);
+	if (event != NULL)
+		trace_event_setup(event, type, flags, pc);
+
+	return event;
+}
+
+DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DEFINE_PER_CPU(int, trace_buffered_event_cnt);
+static int trace_buffered_event_ref;
+
+/**
+ * trace_buffered_event_enable - enable buffering events
+ *
+ * When events are being filtered, it is quicker to use a temporary
+ * buffer to write the event data into if there's a likely chance
+ * that it will not be committed. The discard of the ring buffer
+ * is not as fast as committing, and is much slower than copying
+ * a commit.
+ *
+ * When an event is to be filtered, allocate per cpu buffers to
+ * write the event data into, and if the event is filtered and discarded
+ * it is simply dropped, otherwise, the entire data is to be committed
+ * in one shot.
+ */
+void trace_buffered_event_enable(void)
+{
+	struct ring_buffer_event *event;
+	struct page *page;
+	int cpu;
 
-		tracing_generic_entry_update(ent, flags, pc);
-		ent->type = type;
+	WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+	if (trace_buffered_event_ref++)
+		return;
+
+	for_each_tracing_cpu(cpu) {
+		page = alloc_pages_node(cpu_to_node(cpu),
+					GFP_KERNEL | __GFP_NORETRY, 0);
+		if (!page)
+			goto failed;
+
+		event = page_address(page);
+		memset(event, 0, sizeof(*event));
+
+		per_cpu(trace_buffered_event, cpu) = event;
+
+		preempt_disable();
+		if (cpu == smp_processor_id() &&
+		    this_cpu_read(trace_buffered_event) !=
+		    per_cpu(trace_buffered_event, cpu))
+			WARN_ON_ONCE(1);
+		preempt_enable();
 	}
 
-	return event;
+	return;
+ failed:
+	trace_buffered_event_disable();
+}
+
+static void enable_trace_buffered_event(void *data)
+{
+	/* Probably not needed, but do it anyway */
+	smp_rmb();
+	this_cpu_dec(trace_buffered_event_cnt);
+}
+
+static void disable_trace_buffered_event(void *data)
+{
+	this_cpu_inc(trace_buffered_event_cnt);
+}
+
+/**
+ * trace_buffered_event_disable - disable buffering events
+ *
+ * When a filter is removed, it is faster to not use the buffered
+ * events, and to commit directly into the ring buffer. Free up
+ * the temp buffers when there are no more users. This requires
+ * special synchronization with current events.
+ */
+void trace_buffered_event_disable(void)
+{
+	int cpu;
+
+	WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+	if (WARN_ON_ONCE(!trace_buffered_event_ref))
+		return;
+
+	if (--trace_buffered_event_ref)
+		return;
+
+	preempt_disable();
+	/* For each CPU, set the buffer as used. */
+	smp_call_function_many(tracing_buffer_mask,
+			       disable_trace_buffered_event, NULL, 1);
+	preempt_enable();
+
+	/* Wait for all current users to finish */
+	synchronize_sched();
+
+	for_each_tracing_cpu(cpu) {
+		free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
+		per_cpu(trace_buffered_event, cpu) = NULL;
+	}
+	/*
+	 * Make sure trace_buffered_event is NULL before clearing
+	 * trace_buffered_event_cnt.
+	 */
+	smp_wmb();
+
+	preempt_disable();
+	/* Do the work on each cpu */
+	smp_call_function_many(tracing_buffer_mask,
+			       enable_trace_buffered_event, NULL, 1);
+	preempt_enable();
 }
 
 void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
 	__this_cpu_write(trace_cmdline_save, true);
-	ring_buffer_unlock_commit(buffer, event);
+
+	/* If this is the temp buffer, we need to commit fully */
+	if (this_cpu_read(trace_buffered_event) == event) {
+		/* Length is in event->array[0] */
+		ring_buffer_write(buffer, event->array[0], &event->array[1]);
+		/* Release the temp buffer */
+		this_cpu_dec(trace_buffered_event_cnt);
+	} else
+		ring_buffer_unlock_commit(buffer, event);
 }
 
 static struct ring_buffer *temp_buffer;
@@ -1695,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 			  unsigned long flags, int pc)
 {
 	struct ring_buffer_event *entry;
+	int val;
 
 	*current_rb = trace_file->tr->trace_buffer.buffer;
+
+	if ((trace_file->flags &
+	     (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+	    (entry = this_cpu_read(trace_buffered_event))) {
+		/* Try to use the per cpu buffer first */
+		val = this_cpu_inc_return(trace_buffered_event_cnt);
+		if (val == 1) {
+			trace_event_setup(entry, type, flags, pc);
+			entry->array[0] = len;
+			return entry;
+		}
+		this_cpu_dec(trace_buffered_event_cnt);
+	}
+
 	entry = trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 	/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10156a09103f..5167c366d6b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1083,6 +1083,23 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr,
 	trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
 }
 
+DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DECLARE_PER_CPU(int, trace_buffered_event_cnt);
+void trace_buffered_event_disable(void);
+void trace_buffered_event_enable(void);
+
+static inline void
+__trace_event_discard_commit(struct ring_buffer *buffer,
+			     struct ring_buffer_event *event)
+{
+	if (this_cpu_read(trace_buffered_event) == event) {
+		/* Simply release the temp buffer */
+		this_cpu_dec(trace_buffered_event_cnt);
+		return;
+	}
+	ring_buffer_discard_commit(buffer, event);
+}
+
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
@@ -1111,7 +1128,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
 	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
 	     !filter_match_preds(file->filter, entry))) {
-		ring_buffer_discard_commit(buffer, event);
+		__trace_event_discard_commit(buffer, event);
 		return true;
 	}
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index da1eeb6190e3..4d006707b947 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -363,6 +363,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 {
 	struct trace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
+	unsigned long file_flags = file->flags;
 	int ret = 0;
 	int disable;
 
@@ -445,6 +446,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		break;
 	}
 
+	/* Enable or disable use of trace_buffered_event */
+	if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
+	    (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
+		if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+			trace_buffered_event_enable();
+		else
+			trace_buffered_event_disable();
+	}
+
 	return ret;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d1d27bf37a19..9daa9b3bc6d9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -823,7 +823,12 @@ static void __free_preds(struct event_filter *filter)
 
 static void filter_disable(struct trace_event_file *file)
 {
+	unsigned long old_flags = file->flags;
+
 	file->flags &= ~EVENT_FILE_FL_FILTERED;
+
+	if (old_flags != file->flags)
+		trace_buffered_event_disable();
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -1698,7 +1703,12 @@ fail:
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
+	unsigned long old_flags = file->flags;
+
 	file->flags |= EVENT_FILE_FL_FILTERED;
+
+	if (old_flags != file->flags)
+		trace_buffered_event_enable();
 }
 
 static inline void event_set_filter(struct trace_event_file *file,
-- 
cgit v1.2.3


From 470bf1f27a1472264d18c84b324389509f0e30b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= <mic@digikod.net>
Date: Thu, 24 Mar 2016 02:46:33 +0100
Subject: seccomp: Fix comment typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop accidentally repeated word in comment.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
---
 kernel/seccomp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e1e5a354854e..6c9bb62ed046 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -915,7 +915,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
 
 	fprog = filter->prog->orig_prog;
 	if (!fprog) {
-		/* This must be a new non-cBPF filter, since we save every
+		/* This must be a new non-cBPF filter, since we save
 		 * every cBPF filter's orig_prog above when
 		 * CONFIG_CHECKPOINT_RESTORE is enabled.
 		 */
-- 
cgit v1.2.3


From cc622420798c4bcf093785d872525087a7798db9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 25 Apr 2016 17:35:29 +0200
Subject: gcov: disable for COMPILE_TEST

Enabling gcov is counterproductive to compile testing: it significantly
increases the kernel image size, compile time, and it produces lots
of false positive "may be used uninitialized" warnings as the result
of missed optimizations.

This is in line with how UBSAN_SANITIZE_ALL and PROFILE_ALL_BRANCHES
work, both of which have similar problems.

With an ARM allmodconfig kernel, I see the build time drop from
283 minutes CPU time to 225 minutes, and the vmlinux size drops
from 43MB to 26MB.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Signed-off-by: Michal Marek <mmarek@suse.com>
---
 kernel/gcov/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index c92e44855ddd..1276aabaab55 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL
 
 config GCOV_PROFILE_ALL
 	bool "Profile entire Kernel"
+	depends on !COMPILE_TEST
 	depends on GCOV_KERNEL
 	depends on ARCH_HAS_GCOV_PROFILE_ALL
 	default n
-- 
cgit v1.2.3


From c983f0e8677d5d686ea81de13f9c9ac34f21a2a7 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Tue, 29 Mar 2016 09:35:32 +0100
Subject: seccomp: Get compat syscalls from asm-generic header

Move retrieval of compat syscall numbers into inline function defined in
asm-generic header so that arches may override it.

[ralf@linux-mips.org: Resolve merge conflict.]

Suggested-by: Paul Burton <paul.burton@imgtec.com>
Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/12978/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 kernel/seccomp.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e1e5a354854e..737436ebb4fe 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -518,19 +518,12 @@ static int mode1_syscalls[] = {
 	0, /* null terminated */
 };
 
-#ifdef CONFIG_COMPAT
-static int mode1_syscalls_32[] = {
-	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
-	0, /* null terminated */
-};
-#endif
-
 static void __secure_computing_strict(int this_syscall)
 {
 	int *syscall_whitelist = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
-		syscall_whitelist = mode1_syscalls_32;
+		syscall_whitelist = get_compat_mode1_syscalls();
 #endif
 	do {
 		if (*syscall_whitelist == this_syscall)
-- 
cgit v1.2.3


From cb4253aa0f77f20be018970dbe5d01d78b930ef9 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Tue, 29 Mar 2016 09:35:34 +0100
Subject: secomp: Constify mode1 syscall whitelist

These values are constant and should be marked as such.

Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Will Drewry <wad@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/12979/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 kernel/seccomp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 737436ebb4fe..a0ffcb1a2bee 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -513,14 +513,14 @@ static void seccomp_send_sigsys(int syscall, int reason)
  * To be fully secure this must be combined with rlimit
  * to limit the stack allocations too.
  */
-static int mode1_syscalls[] = {
+static const int mode1_syscalls[] = {
 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
 	0, /* null terminated */
 };
 
 static void __secure_computing_strict(int this_syscall)
 {
-	int *syscall_whitelist = mode1_syscalls;
+	const int *syscall_whitelist = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
 		syscall_whitelist = get_compat_mode1_syscalls();
-- 
cgit v1.2.3


From a831100aeefbe6d9f3e47a3e2712f82c042f1f5c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 10 May 2016 16:34:53 -0300
Subject: perf core: Generalize max_stack sysctl handler

So that it can be used for other stack related knobs, such as the
upcoming one to tweak the max number of of contexts per stack sample.

In all those cases we can only change the value if there are no perf
sessions collecting stacks, so they need to grab that mutex, etc.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-8t3fk94wuzp8m2z1n4gc0s17@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 5 +++--
 kernel/sysctl.c           | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b9325e7dcba1..7fc89939ede9 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -228,7 +228,8 @@ exit_put:
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	int new_value = sysctl_perf_event_max_stack, ret;
+	int *value = table->data;
+	int new_value = *value, ret;
 	struct ctl_table new_table = *table;
 
 	new_table.data = &new_value;
@@ -240,7 +241,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
 	if (atomic_read(&nr_callchain_events))
 		ret = -EBUSY;
 	else
-		sysctl_perf_event_max_stack = new_value;
+		*value = new_value;
 
 	mutex_unlock(&callchain_mutex);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..0ec6907a16b3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1149,7 +1149,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname	= "perf_event_max_stack",
-		.data		= NULL, /* filled in by handler */
+		.data		= &sysctl_perf_event_max_stack,
 		.maxlen		= sizeof(sysctl_perf_event_max_stack),
 		.mode		= 0644,
 		.proc_handler	= perf_event_max_stack_handler,
-- 
cgit v1.2.3


From cfbcf468454ab4b20f0b4b62da51920b99fdb19e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 Apr 2016 12:30:53 -0300
Subject: perf core: Pass max stack as a perf_callchain_entry context

This makes perf_callchain_{user,kernel}() receive the max stack
as context for the perf_callchain_entry, instead of accessing
the global sysctl_perf_event_max_stack.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/bpf/stackmap.c     |  3 ++-
 kernel/events/callchain.c | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index f5a19548be12..a82d7605db3f 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -136,7 +136,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 		return -EINVAL;
 
-	trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 7fc89939ede9..af95ad92893a 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -32,12 +32,12 @@ static DEFINE_MUTEX(callchain_mutex);
 static struct callchain_cpus_entries *callchain_cpus_entries;
 
 
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 				  struct pt_regs *regs)
 {
 }
 
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 				struct pt_regs *regs)
 {
 }
@@ -176,14 +176,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!kernel && !user)
 		return NULL;
 
-	return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+	return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark)
 {
 	struct perf_callchain_entry *entry;
+	struct perf_callchain_entry_ctx ctx;
 	int rctx;
 
 	entry = get_callchain_entry(&rctx);
@@ -193,12 +194,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	if (!entry)
 		goto exit_put;
 
+	ctx.entry     = entry;
+	ctx.max_stack = max_stack;
+
 	entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(entry, regs);
+			perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(&ctx, regs);
 	}
 
 	if (user) {
@@ -214,8 +218,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 				goto exit_put;
 
 			if (add_mark)
-				perf_callchain_store(entry, PERF_CONTEXT_USER);
-			perf_callchain_user(entry, regs);
+				perf_callchain_store(&ctx, PERF_CONTEXT_USER);
+			perf_callchain_user(&ctx, regs);
 		}
 	}
 
-- 
cgit v1.2.3


From 3b1fff08038bd0792b1aa1e9703b2dd0512a3fd0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 10 May 2016 18:08:32 -0300
Subject: perf core: Add a 'nr' field to perf_event_callchain_context

We will use it to count how many addresses are in the entry->ip[] array,
excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really
return the number of entries specified by the user via the relevant
sysctl, kernel.perf_event_max_contexts, or via the per event
perf_event_attr.sample_max_stack knob.

This way we keep the perf_sample->ip_callchain->nr meaning, that is the
number of entries, be it real addresses or PERF_CONTEXT_ entries, while
honouring the max_stack knobs, i.e. the end result will be max_stack
entries if we have at least that many entries in a given stack trace.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index af95ad92893a..8774ff86debb 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -196,8 +196,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
-
-	entry->nr = init_nr;
+	ctx.nr	      = entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-- 
cgit v1.2.3


From 3e4de4ec4cfea40994b47a79767610153edbf45b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 May 2016 13:01:50 -0300
Subject: perf core: Add perf_callchain_store_context() helper

We need have different helpers to account how many contexts we have in
the sample and for real addresses, so do it now as a prep patch, to
ease review.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-q964tnyuqrxw5gld18vizs3c@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 8774ff86debb..ca645736a983 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -200,7 +200,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-			perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL);
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
 		perf_callchain_kernel(&ctx, regs);
 	}
 
@@ -217,7 +217,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 				goto exit_put;
 
 			if (add_mark)
-				perf_callchain_store(&ctx, PERF_CONTEXT_USER);
+				perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 			perf_callchain_user(&ctx, regs);
 		}
 	}
-- 
cgit v1.2.3


From c85b03349640b34f3545503c8429fc43005e9a92 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 May 2016 13:06:21 -0300
Subject: perf core: Separate accounting of contexts and real addresses in a
 stack trace

The perf_sample->ip_callchain->nr value includes all the entries in the
ip_callchain->ip[] array, real addresses and PERF_CONTEXT_{KERNEL,USER,etc},
while what the user expects is that what is in the kernel.perf_event_max_stack
sysctl or in the upcoming per event perf_event_attr.sample_max_stack knob be
honoured in terms of IP addresses in the stack trace.

So allocate a bunch of extra entries for contexts, and do the accounting
via perf_callchain_entry_ctx struct members.

A new sysctl, kernel.perf_event_max_contexts_per_stack is also
introduced for investigating possible bugs in the callchain
implementation by some arch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-3b4wnqk340c4sg4gwkfdi9yk@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 10 +++++++++-
 kernel/sysctl.c           |  9 +++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index ca645736a983..179ef4640964 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -19,11 +19,13 @@ struct callchain_cpus_entries {
 };
 
 int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
 
 static inline size_t perf_callchain_entry__sizeof(void)
 {
 	return (sizeof(struct perf_callchain_entry) +
-		sizeof(__u64) * sysctl_perf_event_max_stack);
+		sizeof(__u64) * (sysctl_perf_event_max_stack +
+				 sysctl_perf_event_max_contexts_per_stack));
 }
 
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
@@ -197,6 +199,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
 	ctx.nr	      = entry->nr = init_nr;
+	ctx.contexts       = 0;
+	ctx.contexts_maxed = false;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
@@ -228,6 +232,10 @@ exit_put:
 	return entry;
 }
 
+/*
+ * Used for sysctl_perf_event_max_stack and
+ * sysctl_perf_event_max_contexts_per_stack.
+ */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0ec6907a16b3..bec4c11c47d6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1156,6 +1156,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &six_hundred_forty_kb,
 	},
+	{
+		.procname	= "perf_event_max_contexts_per_stack",
+		.data		= &sysctl_perf_event_max_contexts_per_stack,
+		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &one_thousand,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-- 
cgit v1.2.3


From 60f05e86cf3e8c5f379fe5ba94634fcec17dd67e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 18 May 2016 17:55:28 +0530
Subject: cpufreq: schedutil: Improve prints messages with pr_fmt

Prefix print messages with KBUILD_MODNAME, i.e 'cpufreq_schedutil: '.
This helps to keep similar formatting for all the print messages
particular to a file and identify those easily in kernel logs.

Its already done this way for rest of the governors.

Along with that, remove the (now) redundant bits from a print message.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 154ae3a51e86..14c4aa25cc45 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -9,6 +9,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/cpufreq.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -388,7 +390,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
-	pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+	pr_err("initialization failed (error %d)\n", ret);
 	return ret;
 }
 
-- 
cgit v1.2.3


From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:02 -0700
Subject: time: add missing implementation for timespec64_add_safe()

timespec64_add_safe() has been defined in time64.h for 64 bit systems.
But, 32 bit systems only have an extern function prototype defined.
Provide a definition for the above function.

The function will be necessary as part of y2038 changes.  struct
timespec is not y2038 safe.  All references to timespec will be replaced
by struct timespec64.  The function is meant to be a replacement for
timespec_add_safe().

The implementation is similar to timespec_add_safe().

Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/time.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/time.c b/kernel/time/time.c
index a4064b612066..cb1f83eb5599 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs,
 
 	return res;
 }
+
+#if __BITS_PER_LONG != 64
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0).
+ * And, each timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+				const struct timespec64 rhs)
+{
+	struct timespec64 res;
+
+	set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+			lhs.tv_nsec + rhs.tv_nsec);
+
+	if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+		res.tv_sec = TIME64_MAX;
+		res.tv_nsec = 0;
+	}
+
+	return res;
+}
+
+#endif
-- 
cgit v1.2.3


From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:08 -0700
Subject: time: remove timespec_add_safe()

All references to timespec_add_safe() now use timespec64_add_safe().

The plan is to replace struct timespec references with struct timespec64
throughout the kernel as timespec is not y2038 safe.

Drop timespec_add_safe() and use timespec64_add_safe() for all
architectures.

Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/time.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/time.c b/kernel/time/time.c
index cb1f83eb5599..667b9335f5d6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs,
 	return res;
 }
 
-#if __BITS_PER_LONG != 64
-
 /*
  * Add two timespec64 values and do a safety check for overflow.
  * It's assumed that both values are valid (>= 0).
@@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 
 	return res;
 }
-
-#endif
-- 
cgit v1.2.3


From 02a982a6ec631d871571f940ca13817551759884 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:26 -0700
Subject: workqueue: update debugobjects fixup callbacks return type

Update the return type to use bool instead of int, corresponding to
change (debugobjects: make fixup functions return bool instead of int)

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f5068e94003..6751b18fd9ac 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -437,7 +437,7 @@ static void *work_debug_hint(void *addr)
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int work_fixup_init(void *addr, enum debug_obj_state state)
+static bool work_fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct work_struct *work = addr;
 
@@ -445,9 +445,9 @@ static int work_fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		cancel_work_sync(work);
 		debug_object_init(work, &work_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -456,7 +456,7 @@ static int work_fixup_init(void *addr, enum debug_obj_state state)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int work_fixup_activate(void *addr, enum debug_obj_state state)
+static bool work_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct work_struct *work = addr;
 
@@ -471,16 +471,16 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
 		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 			debug_object_init(work, &work_debug_descr);
 			debug_object_activate(work, &work_debug_descr);
-			return 0;
+			return false;
 		}
 		WARN_ON_ONCE(1);
-		return 0;
+		return false;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -488,7 +488,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int work_fixup_free(void *addr, enum debug_obj_state state)
+static bool work_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct work_struct *work = addr;
 
@@ -496,9 +496,9 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		cancel_work_sync(work);
 		debug_object_free(work, &work_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
-- 
cgit v1.2.3


From e3252464da222ef2c2ca52dff383a824080ea3d5 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:29 -0700
Subject: timer: update debugobjects fixup callbacks return type

Update the return type to use bool instead of int, corresponding to
cheange (debugobjects: make fixup functions return bool instead of int).

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/hrtimer.c | 18 +++++++++---------
 kernel/time/timer.c   | 30 +++++++++++++++---------------
 2 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa0b983290cf..f962a58c0957 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr)
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct hrtimer *timer = addr;
 
@@ -342,9 +342,9 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		hrtimer_cancel(timer);
 		debug_object_init(timer, &hrtimer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -353,19 +353,19 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	switch (state) {
 
 	case ODEBUG_STATE_NOTAVAILABLE:
 		WARN_ON_ONCE(1);
-		return 0;
+		return false;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -373,7 +373,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct hrtimer *timer = addr;
 
@@ -381,9 +381,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		hrtimer_cancel(timer);
 		debug_object_free(timer, &hrtimer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 73164c3aa56b..be33481a4da1 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -493,7 +493,7 @@ static void *timer_debug_hint(void *addr)
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int timer_fixup_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -501,9 +501,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		del_timer_sync(timer);
 		debug_object_init(timer, &timer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -518,7 +518,7 @@ static void stub_timer(unsigned long data)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -534,18 +534,18 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
 		    timer->entry.next == TIMER_ENTRY_STATIC) {
 			debug_object_init(timer, &timer_debug_descr);
 			debug_object_activate(timer, &timer_debug_descr);
-			return 0;
+			return false;
 		} else {
 			setup_timer(timer, stub_timer, 0);
-			return 1;
+			return true;
 		}
-		return 0;
+		return false;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -553,7 +553,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int timer_fixup_free(void *addr, enum debug_obj_state state)
+static bool timer_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -561,9 +561,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		del_timer_sync(timer);
 		debug_object_free(timer, &timer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -571,7 +571,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
  * fixup_assert_init is called when:
  * - an untracked/uninit-ed object is found
  */
-static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -584,13 +584,13 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 			 * is tracked in the object tracker.
 			 */
 			debug_object_init(timer, &timer_debug_descr);
-			return 0;
+			return false;
 		} else {
 			setup_timer(timer, stub_timer, 0);
-			return 1;
+			return true;
 		}
 	default:
-		return 0;
+		return false;
 	}
 }
 
-- 
cgit v1.2.3


From 3263d28eb5b93b3c1b024366df87b1d0c774228f Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:32 -0700
Subject: rcu: update debugobjects fixup callbacks return type

Update the return type to use bool instead of int, corresponding to
cheange (debugobjects: make fixup functions return bool instead of int).

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/rcu/update.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ccdc8eebc5a..a9df198eb22d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -386,7 +386,7 @@ void destroy_rcu_head(struct rcu_head *head)
  * - an unknown object is activated (might be a statically initialized object)
  * Activation is performed internally by call_rcu().
  */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct rcu_head *head = addr;
 
@@ -399,9 +399,9 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
 		 */
 		debug_object_init(head, &rcuhead_debug_descr);
 		debug_object_activate(head, &rcuhead_debug_descr);
-		return 0;
+		return false;
 	default:
-		return 1;
+		return true;
 	}
 }
 
-- 
cgit v1.2.3


From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:41 -0700
Subject: debugobjects: insulate non-fixup logic related to static obj from
 fixup callbacks

When activating a static object we need make sure that the object is
tracked in the object tracker.  If it is a non-static object then the
activation is illegal.

In previous implementation, each subsystem need take care of this in
their fixup callbacks.  Actually we can put it into debugobjects core.
Thus we can save duplicated code, and have *pure* fixup callbacks.

To achieve this, a new callback "is_static_object" is introduced to let
the type specific code decide whether a object is static or not.  If
yes, we take it into object tracker, otherwise give warning and invoke
fixup callback.

This change has paassed debugobjects selftest, and I also do some test
with all debugobjects supports enabled.

At last, I have a concern about the fixups that can it change the object
which is in incorrect state on fixup? Because the 'addr' may not point
to any valid object if a non-static object is not tracked.  Then Change
such object can overwrite someone's memory and cause unexpected
behaviour.  For example, the timer_fixup_activate bind timer to function
stub_timer.

Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com
[changbin.du@intel.com: improve code comments where invoke the new is_static_object callback]
  Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com
Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/rcu/update.c   | 26 +++-----------------------
 kernel/time/hrtimer.c |  7 +------
 kernel/time/timer.c   | 43 ++++++++++++++-----------------------------
 kernel/workqueue.c    | 42 ++++++++----------------------------------
 4 files changed, 26 insertions(+), 92 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a9df198eb22d..3e888cd5a594 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head)
 	debug_object_free(head, &rcuhead_debug_descr);
 }
 
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
 {
-	struct rcu_head *head = addr;
-
-	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. We just make sure that it is
-		 * tracked in the object tracker.
-		 */
-		debug_object_init(head, &rcuhead_debug_descr);
-		debug_object_activate(head, &rcuhead_debug_descr);
-		return false;
-	default:
-		return true;
-	}
+	return true;
 }
 
 /**
@@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
 
 struct debug_obj_descr rcuhead_debug_descr = {
 	.name = "rcu_head",
-	.fixup_activate = rcuhead_fixup_activate,
+	.is_static_object = rcuhead_is_static_object,
 };
 EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f962a58c0957..8c7392c4fdbd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		WARN_ON_ONCE(1);
-		return false;
-
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index be33481a4da1..3a95f9728778 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr)
 	return ((struct timer_list *) addr)->function;
 }
 
+static bool timer_is_static_object(void *addr)
+{
+	struct timer_list *timer = addr;
+
+	return (timer->entry.pprev == NULL &&
+		timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -516,30 +524,16 @@ static void stub_timer(unsigned long data)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
 	switch (state) {
-
 	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. The timer was
-		 * statically initialized. We just make sure that it
-		 * is tracked in the object tracker.
-		 */
-		if (timer->entry.pprev == NULL &&
-		    timer->entry.next == TIMER_ENTRY_STATIC) {
-			debug_object_init(timer, &timer_debug_descr);
-			debug_object_activate(timer, &timer_debug_descr);
-			return false;
-		} else {
-			setup_timer(timer, stub_timer, 0);
-			return true;
-		}
-		return false;
+		setup_timer(timer, stub_timer, 0);
+		return true;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
@@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (timer->entry.next == TIMER_ENTRY_STATIC) {
-			/*
-			 * This is not really a fixup. The timer was
-			 * statically initialized. We just make sure that it
-			 * is tracked in the object tracker.
-			 */
-			debug_object_init(timer, &timer_debug_descr);
-			return false;
-		} else {
-			setup_timer(timer, stub_timer, 0);
-			return true;
-		}
+		setup_timer(timer, stub_timer, 0);
+		return true;
 	default:
 		return false;
 	}
@@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 static struct debug_obj_descr timer_debug_descr = {
 	.name			= "timer_list",
 	.debug_hint		= timer_debug_hint,
+	.is_static_object	= timer_is_static_object,
 	.fixup_init		= timer_fixup_init,
 	.fixup_activate		= timer_fixup_activate,
 	.fixup_free		= timer_fixup_free,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6751b18fd9ac..e1c0e996b5ae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr)
 	return ((struct work_struct *) addr)->func;
 }
 
+static bool work_is_static_object(void *addr)
+{
+	struct work_struct *work = addr;
+
+	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state)
 	}
 }
 
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- */
-static bool work_fixup_activate(void *addr, enum debug_obj_state state)
-{
-	struct work_struct *work = addr;
-
-	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. The work struct was
-		 * statically initialized. We just make sure that it
-		 * is tracked in the object tracker.
-		 */
-		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
-			debug_object_init(work, &work_debug_descr);
-			debug_object_activate(work, &work_debug_descr);
-			return false;
-		}
-		WARN_ON_ONCE(1);
-		return false;
-
-	case ODEBUG_STATE_ACTIVE:
-		WARN_ON(1);
-
-	default:
-		return false;
-	}
-}
-
 /*
  * fixup_free is called when:
  * - an active object is freed
@@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state)
 static struct debug_obj_descr work_debug_descr = {
 	.name		= "work_struct",
 	.debug_hint	= work_debug_hint,
+	.is_static_object = work_is_static_object,
 	.fixup_init	= work_fixup_init,
-	.fixup_activate	= work_fixup_activate,
 	.fixup_free	= work_fixup_free,
 };
 
-- 
cgit v1.2.3


From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Thu, 19 May 2016 17:09:56 -0700
Subject: kernel/padata.c: removed unused code

By accident I stumbled across code that has never been used.  This
driver has EXPORT_SYMBOL functions, and the only user of the code is
pcrypt.c, but this only uses a subset of the exported symbols.

According to 'git log -G', the functions, padata_set_cpumasks,
padata_add_cpu, and padata_remove_cpu have never been used since they
were first introduced.  This patch removes the unused code.

On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than
4k smaller.

  kbuild_hp> size $KBUILD_OUTPUT/vmlinux
      text    data     bss      dec hex    filename
  10566658 4678360 1122304 16367322 f9beda vmlinux
  10561984 4678360 1122304 16362648 f9ac98 vmlinux

On another config, 32 bit, the saving is about 0.5k bytes.

  kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux
  6012005 2409513 2785280 11206798 ab008e vmlinux
  6011491 2409513 2785280 11206284 aafe8c vmlinux

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/padata.c | 64 ---------------------------------------------------------
 1 file changed, 64 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9c466a..67ddd4acde9d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -606,33 +606,6 @@ out_replace:
 	return 0;
 }
 
-/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- *                       one is used by parallel workers and the second one
- *                       by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
-			cpumask_var_t cbcpumask)
-{
-	int err;
-
-	mutex_lock(&pinst->lock);
-	get_online_cpus();
-
-	err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
-	put_online_cpus();
-	mutex_unlock(&pinst->lock);
-
-	return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
 /**
  * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
  *                     equivalent to @cpumask.
@@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 	return 0;
 }
 
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- *                  padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- *        The @mask may be any combination of the following flags:
- *          PADATA_CPU_SERIAL   - serial cpumask
- *          PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
-	int err;
-
-	if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
-		return -EINVAL;
-
-	mutex_lock(&pinst->lock);
-
-	get_online_cpus();
-	if (mask & PADATA_CPU_SERIAL)
-		cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
-	if (mask & PADATA_CPU_PARALLEL)
-		cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
-	err = __padata_add_cpu(pinst, cpu);
-	put_online_cpus();
-
-	mutex_unlock(&pinst->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
 static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
 {
 	struct parallel_data *pd = NULL;
@@ -1091,7 +1028,6 @@ err_free_inst:
 err:
 	return NULL;
 }
-EXPORT_SYMBOL(padata_alloc);
 
 /**
  * padata_free - free a padata instance
-- 
cgit v1.2.3


From 19d795b677bda354644cfb87a196b087fdc2a965 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 May 2016 17:09:59 -0700
Subject: kernel/padata.c: hide unused functions

A recent cleanup removed some exported functions that were not used
anywhere, which in turn exposed the fact that some other functions in
the same file are only used in some configurations.

We now get a warning about them when CONFIG_HOTPLUG_CPU is disabled:

  kernel/padata.c:670:12: error: '__padata_remove_cpu' defined but not used [-Werror=unused-function]
   static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
              ^~~~~~~~~~~~~~~~~~~
  kernel/padata.c:650:12: error: '__padata_add_cpu' defined but not used [-Werror=unused-function]
   static int __padata_add_cpu(struct padata_instance *pinst, int cpu)

This rearranges the code so the __padata_remove_cpu/__padata_add_cpu
functions are within the #ifdef that protects the code that calls them.

[akpm@linux-foundation.org: coding-style fixes]
Fixes: 4ba6d78c671e ("kernel/padata.c: removed unused code")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Richard Cochran <rcochran@linutronix.de>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/padata.c | 74 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 67ddd4acde9d..993278895ccc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -647,6 +647,43 @@ out:
 }
 EXPORT_SYMBOL(padata_set_cpumask);
 
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+	int err = 0;
+
+	mutex_lock(&pinst->lock);
+
+	if (pinst->flags & PADATA_INVALID)
+		err = -EINVAL;
+
+	 __padata_start(pinst);
+
+	mutex_unlock(&pinst->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+	mutex_lock(&pinst->lock);
+	__padata_stop(pinst);
+	mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
 static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 {
 	struct parallel_data *pd;
@@ -726,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
 }
 EXPORT_SYMBOL(padata_remove_cpu);
 
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
-{
-	int err = 0;
-
-	mutex_lock(&pinst->lock);
-
-	if (pinst->flags & PADATA_INVALID)
-		err =-EINVAL;
-
-	 __padata_start(pinst);
-
-	mutex_unlock(&pinst->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
-	mutex_lock(&pinst->lock);
-	__padata_stop(pinst);
-	mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
 static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
 {
 	return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
-- 
cgit v1.2.3


From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:49 -0700
Subject: mm: rename _count, field of the struct page, to _refcount

Many developers already know that field for reference count of the
struct page is _count and atomic type.  They would try to handle it
directly and this could break the purpose of page reference count
tracepoint.  To prevent direct _count modification, this patch rename it
to _refcount and add warning message on the code.  After that, developer
who need to handle reference count will find that field should not be
accessed directly.

[akpm@linux-foundation.org: fix comments, per Vlastimil]
[akpm@linux-foundation.org: Documentation/vm/transhuge.txt too]
[sfr@canb.auug.org.au: sync ethernet driver changes]
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Manish Chopra <manish.chopra@qlogic.com>
Cc: Yuval Mintz <yuval.mintz@qlogic.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3ee3b86..1c03dfb4abfd 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_STRUCT_SIZE(list_head);
 	VMCOREINFO_SIZE(nodemask_t);
 	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, _refcount);
 	VMCOREINFO_OFFSET(page, mapping);
 	VMCOREINFO_OFFSET(page, lru);
 	VMCOREINFO_OFFSET(page, _mapcount);
-- 
cgit v1.2.3


From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:10:58 -0700
Subject: include/linux/nodemask.h: create next_node_in() helper

Lots of code does

	node = next_node(node, XXX);
	if (node == MAX_NUMNODES)
		node = first_node(XXX);

so create next_node_in() to do this and use it in various places.

[mhocko@suse.com: use next_node_in() helper]
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Hui Zhu <zhuhui@xiaomi.com>
Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956baba1..611cc69af8f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 
 static int cpuset_spread_node(int *rotor)
 {
-	int node;
-
-	node = next_node(*rotor, current->mems_allowed);
-	if (node == MAX_NUMNODES)
-		node = first_node(current->mems_allowed);
-	*rotor = node;
-	return node;
+	return *rotor = next_node_in(*rotor, current->mems_allowed);
 }
 
 int cpuset_mem_spread_node(void)
-- 
cgit v1.2.3


From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:50 -0700
Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update

Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing.  Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient.  Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative some of the time.  I have not yet worked out why, but
have no evidence that it's actually harmful.  Punt for the moment by
just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "stat_refresh",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0600,
+		.proc_handler	= vmstat_refresh,
+	},
 #endif
 #ifdef CONFIG_MMU
 	{
-- 
cgit v1.2.3


From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:14:30 -0700
Subject: cpuset: use static key better and convert to new API

An important function for cpusets is cpuset_node_allowed(), which
optimizes on the fact if there's a single root CPU set, it must be
trivially allowed.  But the check "nr_cpusets() <= 1" doesn't use the
cpusets_enabled_key static key the right way where static keys eliminate
branching overhead with jump labels.

This patch converts it so that static key is used properly.  It's also
switched to the new static key API and the checking functions are
converted to return bool instead of int.  We also provide a new variant
__cpuset_zone_allowed() which expects that the static key check was
already done and they key was enabled.  This is needed for
get_page_from_freelist() where we want to also avoid the relatively
slower check when ALLOC_CPUSET is not set in alloc_flags.

The impact on the page allocator microbenchmark is less than expected
but the cleanup in itself is worthwhile.

                                             4.6.0-rc2                  4.6.0-rc2
                                       multcheck-v1r20               cpuset-v1r20
  Min      alloc-odr0-1               348.00 (  0.00%)           348.00 (  0.00%)
  Min      alloc-odr0-2               254.00 (  0.00%)           254.00 (  0.00%)
  Min      alloc-odr0-4               213.00 (  0.00%)           213.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           183.00 (  1.61%)
  Min      alloc-odr0-16              173.00 (  0.00%)           171.00 (  1.16%)
  Min      alloc-odr0-32              166.00 (  0.00%)           163.00 (  1.81%)
  Min      alloc-odr0-64              162.00 (  0.00%)           159.00 (  1.85%)
  Min      alloc-odr0-128             160.00 (  0.00%)           157.00 (  1.88%)
  Min      alloc-odr0-256             169.00 (  0.00%)           166.00 (  1.78%)
  Min      alloc-odr0-512             180.00 (  0.00%)           180.00 (  0.00%)
  Min      alloc-odr0-1024            188.00 (  0.00%)           187.00 (  0.53%)
  Min      alloc-odr0-2048            194.00 (  0.00%)           193.00 (  0.52%)
  Min      alloc-odr0-4096            199.00 (  0.00%)           198.00 (  0.50%)
  Min      alloc-odr0-8192            202.00 (  0.00%)           201.00 (  0.50%)
  Min      alloc-odr0-16384           203.00 (  0.00%)           202.00 (  0.49%)

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 611cc69af8f0..73e93e53884d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /* See "Frequency meter" comments, below. */
 
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 	unsigned long flags;
 
 	if (in_interrupt())
-		return 1;
+		return true;
 	if (node_isset(node, current->mems_allowed))
-		return 1;
+		return true;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
 	 */
 	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
+		return true;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
-		return 0;
+		return false;
 
 	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return 1;
+		return true;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
 	spin_lock_irqsave(&callback_lock, flags);
-- 
cgit v1.2.3


From 6112a300c9e41993cc0dc56ac393743d28381284 Mon Sep 17 00:00:00 2001
From: Soumya PN <soumya.p.n@hpe.com>
Date: Tue, 17 May 2016 21:31:14 +0530
Subject: ftrace: Don't disable irqs when taking the tasklist_lock read_lock

In ftrace.c inside the function alloc_retstack_tasklist() (which will be
invoked when function_graph tracing is on) the tasklist_lock is being
held as reader while iterating through a list of threads. Here the lock
is being held as reader with irqs disabled. The tasklist_lock is never
write_locked in interrupt context so it is safe to not disable interrupts
for the duration of read_lock in this block which, can be significant,
given the block of code iterates through all threads. Hence changing the
code to call read_lock() and read_unlock() instead of read_lock_irqsave()
and read_unlock_irqrestore().

A similar change was made in commits: 8063e41d2ffc ("tracing: Change
syscall_*regfunc() to check PF_KTHREAD and use for_each_process_thread()")'
and 3472eaa1f12e ("sched: normalize_rt_tasks(): Don't use _irqsave for
tasklist_lock, use task_rq_lock()")'

Link: http://lkml.kernel.org/r/1463500874-77480-1-git-send-email-soumya.p.n@hpe.com

Signed-off-by: Soumya PN <soumya.p.n@hpe.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1870fbd2b67..a6804823a058 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5713,7 +5713,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 {
 	int i;
 	int ret = 0;
-	unsigned long flags;
 	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
 	struct task_struct *g, *t;
 
@@ -5729,7 +5728,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 	}
 
-	read_lock_irqsave(&tasklist_lock, flags);
+	read_lock(&tasklist_lock);
 	do_each_thread(g, t) {
 		if (start == end) {
 			ret = -EAGAIN;
@@ -5747,7 +5746,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 	} while_each_thread(g, t);
 
 unlock:
-	read_unlock_irqrestore(&tasklist_lock, flags);
+	read_unlock(&tasklist_lock);
 free:
 	for (i = start; i < end; i++)
 		kfree(ret_stack_list[i]);
-- 
cgit v1.2.3


From b7552e1bccbe3da9c8e7386c6188e8ea4667c8e7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 18 May 2016 14:14:28 +0200
Subject: bpf: rather use get_random_int for randomizations

Start address randomization and blinding in BPF currently use
prandom_u32(). prandom_u32() values are not exposed to unpriviledged
user space to my knowledge, but given other kernel facilities such as
ASLR, stack canaries, etc make use of stronger get_random_int(), we
better make use of it here as well given blinding requests successively
new random values. get_random_int() has minimal entropy pool depletion,
is not cryptographically secure, but doesn't need to be for our use
cases here.

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f1e8a0def99b..b94a36550591 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	hdr->pages = size / PAGE_SIZE;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
-	start = (prandom_u32() % hole) & ~(alignment - 1);
+	start = (get_random_int() % hole) & ~(alignment - 1);
 
 	/* Leave a random number of instructions before BPF code. */
 	*image_ptr = &hdr->image[start];
@@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      struct bpf_insn *to_buff)
 {
 	struct bpf_insn *to = to_buff;
-	u32 imm_rnd = prandom_u32();
+	u32 imm_rnd = get_random_int();
 	s16 off;
 
 	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
-- 
cgit v1.2.3


From e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 May 2016 17:22:48 -0500
Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem

While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the
bpf filesystem.  Looking at the code I saw a broken usage of mount_ns
with current->nsproxy->mnt_ns. As the code does not acquire a
reference to the mount namespace it can not possibly be correct to
store the mount namespace on the superblock as it does.

Replace mount_ns with mount_nodev so that each mount of the bpf
filesystem returns a distinct instance, and the code is not buggy.

In discussion with Hannes Frederic Sowa it was reported that the use
of mount_ns was an attempt to have one bpf instance per mount
namespace, in an attempt to keep resources that pin resources from
hiding.  That intent simply does not work, the vfs is not built to
allow that kind of behavior.  Which means that the bpf filesystem
really is buggy both semantically and in it's implemenation as it does
not nor can it implement the original intent.

This change is userspace visible, but my experience with similar
filesystems leads me to believe nothing will break with a model of each
mount of the bpf filesystem is distinct from all others.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 71b75d9c81da..04be7021f848 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -357,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
 static struct dentry *bpf_mount(struct file_system_type *type, int flags,
 				const char *dev_name, void *data)
 {
-	return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+	return mount_nodev(type, flags, data, bpf_fill_super);
 }
 
 static struct file_system_type bpf_fs_type = {
-- 
cgit v1.2.3


From d91b28ed42de99217efb2e8cb0357263d6fb737c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 19 May 2016 18:17:13 -0700
Subject: bpf: support decreasing order in direct packet access

when packet headers are accessed in 'decreasing' order (like TCP port
may be fetched before the program reads IP src) the llvm may generate
the following code:
[...]                // R7=pkt(id=0,off=22,r=70)
r2 = *(u32 *)(r7 +0) // good access
[...]
r7 += 40             // R7=pkt(id=0,off=62,r=70)
r8 = *(u32 *)(r7 +0) // good access
[...]
r1 = *(u32 *)(r7 -20) // this one will fail though it's within a safe range
                      // it's doing *(u32*)(skb->data + 42)
Fix verifier to recognize such code pattern

Alos turned out that 'off > range' condition is not a verifier bug.
It's a buggy program that may do something like:
if (ptr + 50 > data_end)
  return 0;
ptr += 60;
*(u32*)ptr;
in such case emit
"invalid access to packet, off=0 size=4, R1(id=0,off=60,r=50)" error message,
so all information is available for the program author to fix the program.

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a08d66215245..d54e34874579 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -683,15 +683,11 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 {
 	struct reg_state *regs = env->cur_state.regs;
 	struct reg_state *reg = &regs[regno];
-	int linear_size = (int) reg->range - (int) reg->off;
 
-	if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
-		verbose("verifier bug\n");
-		return -EFAULT;
-	}
-	if (off < 0 || off + size > linear_size) {
-		verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
-			off, size, linear_size);
+	off += reg->off;
+	if (off < 0 || off + size > reg->range) {
+		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
 	}
 	return 0;
-- 
cgit v1.2.3


From 1b9b69ecb3a5236d4d3da0f0fa11af916371841e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 19 May 2016 18:17:14 -0700
Subject: bpf: teach verifier to recognize imm += ptr pattern

Humans don't write C code like:
  u8 *ptr = skb->data;
  int imm = 4;
  imm += ptr;
but from llvm backend point of view 'imm' and 'ptr' are registers and
imm += ptr may be preferred vs ptr += imm depending which register value
will be used further in the code, while verifier can only recognize ptr += imm.
That caused small unrelated changes in the C code of the bpf program to
trigger rejection by the verifier. Therefore teach the verifier to recognize
both ptr += imm and imm += ptr.
For example:
when R6=pkt(id=0,off=0,r=62) R7=imm22
after r7 += r6 instruction
will be R6=pkt(id=0,off=0,r=62) R7=pkt(id=0,off=22,r=62)

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d54e34874579..668e07903c8f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1245,6 +1245,7 @@ static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
 	struct reg_state *regs = env->cur_state.regs;
 	struct reg_state *dst_reg = &regs[insn->dst_reg];
 	struct reg_state *src_reg = &regs[insn->src_reg];
+	struct reg_state tmp_reg;
 	s32 imm;
 
 	if (BPF_SRC(insn->code) == BPF_K) {
@@ -1267,6 +1268,19 @@ add_imm:
 		 */
 		dst_reg->off += imm;
 	} else {
+		if (src_reg->type == PTR_TO_PACKET) {
+			/* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
+			tmp_reg = *dst_reg;  /* save r7 state */
+			*dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
+			src_reg = &tmp_reg;  /* pretend it's src_reg state */
+			/* if the checks below reject it, the copy won't matter,
+			 * since we're rejecting the whole program. If all ok,
+			 * then imm22 state will be added to r7
+			 * and r7 will be pkt(id=0,off=22,r=62) while
+			 * r6 will stay as pkt(id=0,off=0,r=62)
+			 */
+		}
+
 		if (src_reg->type == CONST_IMM) {
 			/* pkt_ptr += reg where reg is known constant */
 			imm = src_reg->imm;
@@ -1565,7 +1579,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			return 0;
 		} else if (opcode == BPF_ADD &&
 			   BPF_CLASS(insn->code) == BPF_ALU64 &&
-			   dst_reg->type == PTR_TO_PACKET) {
+			   (dst_reg->type == PTR_TO_PACKET ||
+			    (BPF_SRC(insn->code) == BPF_X &&
+			     regs[insn->src_reg].type == PTR_TO_PACKET))) {
 			/* ptr_to_packet += K|X */
 			return check_packet_ptr_add(env, insn);
 		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
-- 
cgit v1.2.3


From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:21 -0700
Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper
 context

Tetsuo has properly noted that mmput slow path might get blocked waiting
for another party (e.g.  exit_aio waits for an IO).  If that happens the
oom_reaper would be put out of the way and will not be able to process
next oom victim.  We should strive for making this context as reliable
and independent on other subsystems as much as possible.

Introduce mmput_async which will perform the slow path from an async
(WQ) context.  This will delay the operation but that shouldn't be a
problem because the oom_reaper has reclaimed the victim's address space
for most cases as much as possible and the remaining context shouldn't
bind too much memory anymore.  The only exception is when mmap_sem
trylock has failed which shouldn't happen too often.

The issue is only theoretical but not impossible.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3e8451527cbe..8fbed7194af1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+static inline void __mmput(struct mm_struct *mm)
+{
+	VM_BUG_ON(atomic_read(&mm->mm_users));
+
+	uprobe_clear_state(mm);
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
  */
@@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
+	if (atomic_dec_and_test(&mm->mm_users))
+		__mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
 	if (atomic_dec_and_test(&mm->mm_users)) {
-		uprobe_clear_state(mm);
-		exit_aio(mm);
-		ksm_exit(mm);
-		khugepaged_exit(mm); /* must run before exit_mmap */
-		exit_mmap(mm);
-		set_mm_exe_file(mm, NULL);
-		if (!list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_del(&mm->mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
-		mmdrop(mm);
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
 	}
 }
-EXPORT_SYMBOL_GPL(mmput);
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
-- 
cgit v1.2.3


From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 20 May 2016 17:00:20 -0700
Subject: exit_thread: accept a task parameter to be exited

We need to call exit_thread from copy_process in a fail path.  So make it
accept task_struct as a parameter.

[v2]
* s390: exit_thread_runtime_instr doesn't make sense to be called for
  non-current tasks.
* arm: fix the comment in vfp_thread_copy
* change 'me' to 'tsk' for task_struct
* now we can change only archs that actually have exit_thread

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195667e1..75b34fe835b2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -746,7 +746,7 @@ void do_exit(long code)
 		disassociate_ctty(1);
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
-	exit_thread();
+	exit_thread(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
-- 
cgit v1.2.3


From 0740aa5f6375681c57488c4ea55d05a0341cfc9c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 20 May 2016 17:00:25 -0700
Subject: fork: free thread in copy_process on failure

When using this program (as root):

	#include <err.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#include <sys/io.h>
	#include <sys/types.h>
	#include <sys/wait.h>

	#define ITER 1000
	#define FORKERS 15
	#define THREADS (6000/FORKERS) // 1850 is proc max

	static void fork_100_wait()
	{
		unsigned a, to_wait = 0;

		printf("\t%d forking %d\n", THREADS, getpid());

		for (a = 0; a < THREADS; a++) {
			switch (fork()) {
			case 0:
				usleep(1000);
				exit(0);
				break;
			case -1:
				break;
			default:
				to_wait++;
				break;
			}
		}

		printf("\t%d forked from %d, waiting for %d\n", THREADS, getpid(),
				to_wait);

		for (a = 0; a < to_wait; a++)
			wait(NULL);

		printf("\t%d waited from %d\n", THREADS, getpid());
	}

	static void run_forkers()
	{
		pid_t forkers[FORKERS];
		unsigned a;

		for (a = 0; a < FORKERS; a++) {
			switch ((forkers[a] = fork())) {
			case 0:
				fork_100_wait();
				exit(0);
				break;
			case -1:
				err(1, "DIE fork of %d'th forker", a);
				break;
			default:
				break;
			}
		}

		for (a = 0; a < FORKERS; a++)
			waitpid(forkers[a], NULL, 0);
	}

	int main()
	{
		unsigned a;
		int ret;

		ret = ioperm(10, 20, 0);
		if (ret < 0)
			err(1, "ioperm");

		for (a = 0; a < ITER; a++)
			run_forkers();

		return 0;
	}

kmemleak reports many occurences of this leak:
unreferenced object 0xffff8805917c8000 (size 8192):
  comm "fork-leak", pid 2932, jiffies 4295354292 (age 1871.028s)
  hex dump (first 32 bytes):
    ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff  ................
    ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff  ................
  backtrace:
    [<ffffffff814cfbf5>] kmemdup+0x25/0x50
    [<ffffffff8103ab43>] copy_thread_tls+0x6c3/0x9a0
    [<ffffffff81150174>] copy_process+0x1a84/0x5790
    [<ffffffff811dc375>] wake_up_new_task+0x2d5/0x6f0
    [<ffffffff8115411d>] _do_fork+0x12d/0x820
...

Due to the leakage of the memory items which should have been freed in
arch/x86/kernel/process.c:exit_thread().

Make sure the memory is freed when fork fails later in copy_process.
This is done by calling exit_thread with the thread to kill.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8fbed7194af1..103d78fd8f75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1490,7 +1490,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
 		if (IS_ERR(pid)) {
 			retval = PTR_ERR(pid);
-			goto bad_fork_cleanup_io;
+			goto bad_fork_cleanup_thread;
 		}
 	}
 
@@ -1652,6 +1652,8 @@ bad_fork_cancel_cgroup:
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
+bad_fork_cleanup_thread:
+	exit_thread(p);
 bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
-- 
cgit v1.2.3


From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:33 -0700
Subject: printk/nmi: generic solution for safe printk in NMI

printk() takes some locks and could not be used a safe way in NMI
context.

The chance of a deadlock is real especially when printing stacks from
all CPUs.  This particular problem has been addressed on x86 by the
commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all
CPUs").

The patchset brings two big advantages.  First, it makes the NMI
backtraces safe on all architectures for free.  Second, it makes all NMI
messages almost safe on all architectures (the temporary buffer is
limited.  We still should keep the number of messages in NMI context at
minimum).

Note that there already are several messages printed in NMI context:
WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE
handlers.  These are not easy to avoid.

This patch reuses most of the code and makes it generic.  It is useful
for all messages and architectures that support NMI.

The alternative printk_func is set when entering and is reseted when
leaving NMI context.  It queues IRQ work to copy the messages into the
main ring buffer in a safe context.

__printk_nmi_flush() copies all available messages and reset the buffer.
Then we could use a simple cmpxchg operations to get synchronized with
writers.  There is also used a spinlock to get synchronized with other
flushers.

We do not longer use seq_buf because it depends on external lock.  It
would be hard to make all supported operations safe for a lockless use.
It would be confusing and error prone to make only some operations safe.

The code is put into separate printk/nmi.c as suggested by Steven
Rostedt.  It needs a per-CPU buffer and is compiled only on
architectures that call nmi_enter().  This is achieved by the new
HAVE_NMI Kconfig flag.

The are MN10300 and Xtensa architectures.  We need to clean up NMI
handling there first.  Let's do it separately.

The patch is heavily based on the draft from Peter Zijlstra, see

  https://lkml.org/lkml/2015/6/10/327

[arnd@arndb.de: printk-nmi: use %zu format string for size_t]
[akpm@linux-foundation.org: min_t->min - all types are size_t here]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jan Kara <jack@suse.cz>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>	[arm part]
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/Makefile   |   1 +
 kernel/printk/internal.h |  44 ++++++++++
 kernel/printk/nmi.c      | 219 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c   |  19 +---
 4 files changed, 265 insertions(+), 18 deletions(-)
 create mode 100644 kernel/printk/internal.h
 create mode 100644 kernel/printk/nmi.c

(limited to 'kernel')

diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..abb0042a427b 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,3 @@
 obj-y	= printk.o
+obj-$(CONFIG_PRINTK_NMI)		+= nmi.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..2de99faedfc1
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,44 @@
+/*
+ * internal.h - printk internal definitions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/percpu.h>
+
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+
+#ifdef CONFIG_PRINTK_NMI
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it temporary stores the strings into a per-CPU buffer.
+ * The alternative implementation is chosen transparently
+ * via per-CPU variable.
+ */
+DECLARE_PER_CPU(printk_func_t, printk_func);
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else /* CONFIG_PRINTK_NMI */
+
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	return vprintk_default(fmt, args);
+}
+
+#endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
new file mode 100644
index 000000000000..303cf0d15e57
--- /dev/null
+++ b/kernel/printk/nmi.c
@@ -0,0 +1,219 @@
+/*
+ * nmi.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporary stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via @printk_func per-CPU variable.
+ *
+ * The implementation allows to flush the strings also from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_nmi_irq_ready;
+
+#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct nmi_seq_buf {
+	atomic_t		len;	/* length of written data */
+	struct irq_work		work;	/* IRQ work that flushes the buffer */
+	unsigned char		buffer[NMI_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	int add = 0;
+	size_t len;
+
+again:
+	len = atomic_read(&s->len);
+
+	if (len >= sizeof(s->buffer))
+		return 0;
+
+	/*
+	 * Make sure that all old data have been read before the buffer was
+	 * reseted. This is not needed when we just append data.
+	 */
+	if (!len)
+		smp_rmb();
+
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+	/*
+	 * Do it once again if the buffer has been flushed in the meantime.
+	 * Note that atomic_cmpxchg() is an implicit memory barrier that
+	 * makes sure that the data were written before updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, len + add) != len)
+		goto again;
+
+	/* Get flushed in a more safe context. */
+	if (add && printk_nmi_irq_ready) {
+		/* Make sure that IRQ work is really initialized. */
+		smp_rmb();
+		irq_work_queue(&s->work);
+	}
+
+	return add;
+}
+
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk("%.*s", (end - start) + 1, buf);
+}
+
+/*
+ * Flush data from the associated per_CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_nmi_flush(struct irq_work *work)
+{
+	static raw_spinlock_t read_lock =
+		__RAW_SPIN_LOCK_INITIALIZER(read_lock);
+	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+	unsigned long flags;
+	size_t len, size;
+	int i, last_i;
+
+	/*
+	 * The lock has two functions. First, one reader has to flush all
+	 * available message to make the lockless synchronization with
+	 * writers easier. Second, we do not want to mix messages from
+	 * different CPUs. This is especially important when printing
+	 * a backtrace.
+	 */
+	raw_spin_lock_irqsave(&read_lock, flags);
+
+	i = 0;
+more:
+	len = atomic_read(&s->len);
+
+	/*
+	 * This is just a paranoid check that nobody has manipulated
+	 * the buffer an unexpected way. If we printed something then
+	 * @len must only increase.
+	 */
+	if (i && i >= len)
+		pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
+		       i, len);
+
+	if (!len)
+		goto out; /* Someone else has already flushed the buffer. */
+
+	/* Make sure that data has been written up to the @len */
+	smp_rmb();
+
+	size = min(len, sizeof(s->buffer));
+	last_i = i;
+
+	/* Print line by line. */
+	for (; i < size; i++) {
+		if (s->buffer[i] == '\n') {
+			print_nmi_seq_line(s, last_i, i);
+			last_i = i + 1;
+		}
+	}
+	/* Check if there was a partial line. */
+	if (last_i < size) {
+		print_nmi_seq_line(s, last_i, size - 1);
+		pr_cont("\n");
+	}
+
+	/*
+	 * Check that nothing has got added in the meantime and truncate
+	 * the buffer. Note that atomic_cmpxchg() is an implicit memory
+	 * barrier that makes sure that the data were copied before
+	 * updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, 0) != len)
+		goto more;
+
+out:
+	raw_spin_unlock_irqrestore(&read_lock, flags);
+}
+
+/**
+ * printk_nmi_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_nmi_flush(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void __init printk_nmi_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+		init_irq_work(&s->work, __printk_nmi_flush);
+	}
+
+	/* Make sure that IRQ works are initialized before enabling. */
+	smp_wmb();
+	printk_nmi_irq_ready = 1;
+
+	/* Flush pending messages that did not have scheduled IRQ works. */
+	printk_nmi_flush();
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_write(printk_func, vprintk_default);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bfbf284e4218..71eba0607034 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
 
 #include "console_cmdline.h"
 #include "braille.h"
+#include "internal.h"
 
 int console_printk[4] = {
 	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */
@@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
 /**
  * printk - print a kernel message
  * @fmt: format string
@@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
  */
 asmlinkage __visible int printk(const char *fmt, ...)
 {
-	printk_func_t vprintk_func;
 	va_list args;
 	int r;
 
 	va_start(args, fmt);
-
-	/*
-	 * If a caller overrides the per_cpu printk_func, then it needs
-	 * to disable preemption when calling printk(). Otherwise
-	 * the printk_func should be set to the default. No need to
-	 * disable preemption here.
-	 */
-	vprintk_func = this_cpu_read(printk_func);
 	r = vprintk_func(fmt, args);
-
 	va_end(args);
 
 	return r;
-- 
cgit v1.2.3


From b522deabc6f18e4f938d93a84f345f2cbf3347d1 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:36 -0700
Subject: printk/nmi: warn when some message has been lost in NMI context

We could not resize the temporary buffer in NMI context.  Let's warn if
a message is lost.

This is rather theoretical.  printk() should not be used in NMI.  The
only sensible use is when we want to print backtrace from all CPUs.  The
current buffer should be enough for this purpose.

[akpm@linux-foundation.org: whitespace fixlet]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/internal.h | 11 +++++++++++
 kernel/printk/nmi.c      |  5 ++++-
 kernel/printk/printk.c   | 10 ++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2de99faedfc1..341bedccc065 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -34,6 +34,12 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 	return this_cpu_read(printk_func)(fmt, args);
 }
 
+extern atomic_t nmi_message_lost;
+static inline int get_nmi_message_lost(void)
+{
+	return atomic_xchg(&nmi_message_lost, 0);
+}
+
 #else /* CONFIG_PRINTK_NMI */
 
 static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
@@ -41,4 +47,9 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 	return vprintk_default(fmt, args);
 }
 
+static inline int get_nmi_message_lost(void)
+{
+	return 0;
+}
+
 #endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index 303cf0d15e57..572f94922230 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -39,6 +39,7 @@
  */
 DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 static int printk_nmi_irq_ready;
+atomic_t nmi_message_lost;
 
 #define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
 
@@ -64,8 +65,10 @@ static int vprintk_nmi(const char *fmt, va_list args)
 again:
 	len = atomic_read(&s->len);
 
-	if (len >= sizeof(s->buffer))
+	if (len >= sizeof(s->buffer)) {
+		atomic_inc(&nmi_message_lost);
 		return 0;
+	}
 
 	/*
 	 * Make sure that all old data have been read before the buffer was
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 71eba0607034..e38579d730f4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1617,6 +1617,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 	unsigned long flags;
 	int this_cpu;
 	int printed_len = 0;
+	int nmi_message_lost;
 	bool in_sched = false;
 	/* cpu currently holding logbuf_lock in this function */
 	static unsigned int logbuf_cpu = UINT_MAX;
@@ -1667,6 +1668,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 					 strlen(recursion_msg));
 	}
 
+	nmi_message_lost = get_nmi_message_lost();
+	if (unlikely(nmi_message_lost)) {
+		text_len = scnprintf(textbuf, sizeof(textbuf),
+				     "BAD LUCK: lost %d message(s) from NMI context!",
+				     nmi_message_lost);
+		printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+					 NULL, 0, textbuf, text_len);
+	}
+
 	/*
 	 * The printf needs to come first; we need the syslog
 	 * prefix which might be passed-in as a parameter.
-- 
cgit v1.2.3


From 427934b8714ec130b068d1c9d88f25b24aaede32 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:39 -0700
Subject: printk/nmi: increase the size of NMI buffer and make it configurable

Testing has shown that the backtrace sometimes does not fit into the 4kB
temporary buffer that is used in NMI context.  The warnings are gone
when I double the temporary buffer size.

This patch doubles the buffer size and makes it configurable.

Note that this problem existed even in the x86-specific implementation
that was added by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI
stack trace on all CPUs").  Nobody noticed it because it did not print
any warnings.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/nmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index 572f94922230..bf08557d7e3d 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -41,7 +41,8 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 static int printk_nmi_irq_ready;
 atomic_t nmi_message_lost;
 
-#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
+#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) -		\
+			 sizeof(atomic_t) - sizeof(struct irq_work))
 
 struct nmi_seq_buf {
 	atomic_t		len;	/* length of written data */
-- 
cgit v1.2.3


From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:42 -0700
Subject: printk/nmi: flush NMI messages on the system panic

In NMI context, printk() messages are stored into per-CPU buffers to
avoid a possible deadlock.  They are normally flushed to the main ring
buffer via an IRQ work.  But the work is never called when the system
calls panic() in the very same NMI handler.

This patch tries to flush NMI buffers before the crash dump is
generated.  In this case it does not risk a double release and bails out
when the logbuf_lock is already taken.  The aim is to get the messages
into the main ring buffer when possible.  It makes them better
accessible in the vmcore.

Then the patch tries to flush the buffers second time when other CPUs
are down.  It might be more aggressive and reset logbuf_lock.  The aim
is to get the messages available for the consequent kmsg_dump() and
console_flush_on_panic() calls.

The patch causes vprintk_emit() to be called even in NMI context again.
But it is done via printk_deferred() so that the console handling is
skipped.  Consoles use internal locks and we could not prevent a
deadlock easily.  They are explicitly called later when the crash dump
is not generated, see console_flush_on_panic().

Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c      |  1 +
 kernel/panic.c           |  6 +++++-
 kernel/printk/internal.h |  2 ++
 kernel/printk/nmi.c      | 39 ++++++++++++++++++++++++++++++++++++++-
 kernel/printk/printk.c   |  2 +-
 5 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1c03dfb4abfd..d5d408252992 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs)
 	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
 	if (old_cpu == PANIC_CPU_INVALID) {
 		/* This is the 1st CPU which comes here, so go ahead. */
+		printk_nmi_flush_on_panic();
 		__crash_kexec(regs);
 
 		/*
diff --git a/kernel/panic.c b/kernel/panic.c
index 535c96510a44..8aa74497cc5a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -160,8 +160,10 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (!crash_kexec_post_notifiers)
+	if (!crash_kexec_post_notifiers) {
+		printk_nmi_flush_on_panic();
 		__crash_kexec(NULL);
+	}
 
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
@@ -176,6 +178,8 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
+	/* Call flush even twice. It tries harder with a single online CPU */
+	printk_nmi_flush_on_panic();
 	kmsg_dump(KMSG_DUMP_PANIC);
 
 	/*
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 341bedccc065..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
+extern raw_spinlock_t logbuf_lock;
+
 /*
  * printk() could not take logbuf_lock in NMI context. Instead,
  * it temporary stores the strings into a per-CPU buffer.
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bf08557d7e3d..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -17,6 +17,7 @@
 
 #include <linux/preempt.h>
 #include <linux/spinlock.h>
+#include <linux/debug_locks.h>
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/irq_work.h>
@@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
 {
 	const char *buf = s->buffer + start;
 
-	printk("%.*s", (end - start) + 1, buf);
+	/*
+	 * The buffers are flushed in NMI only on panic.  The messages must
+	 * go only into the ring buffer at this stage.  Consoles will get
+	 * explicitly called later when a crashdump is not generated.
+	 */
+	if (in_nmi())
+		printk_deferred("%.*s", (end - start) + 1, buf);
+	else
+		printk("%.*s", (end - start) + 1, buf);
+
 }
 
 /*
@@ -194,6 +204,33 @@ void printk_nmi_flush(void)
 		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
 }
 
+/**
+ * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ *	goes down.
+ *
+ * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * the system goes down. It does the best effort to get NMI messages into
+ * the main ring buffer.
+ *
+ * Note that it could try harder when there is only one CPU online.
+ */
+void printk_nmi_flush_on_panic(void)
+{
+	/*
+	 * Make sure that we could access the main ring buffer.
+	 * Do not risk a double release when more CPUs are up.
+	 */
+	if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+		if (num_online_cpus() > 1)
+			return;
+
+		debug_locks_off();
+		raw_spin_lock_init(&logbuf_lock);
+	}
+
+	printk_nmi_flush();
+}
+
 void __init printk_nmi_init(void)
 {
 	int cpu;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e38579d730f4..60cdf6386763 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -245,7 +245,7 @@ __packed __aligned(4)
  * within the scheduler's rq lock. It must be released before calling
  * console_unlock() or anything else that might wake up a process.
  */
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
+DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #ifdef CONFIG_PRINTK
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
-- 
cgit v1.2.3


From ede9c27749b9b35efdffa4f63a39f819d7913752 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:10 -0700
Subject: kernel/sysctl_binary.c: use generic UUID library

UUID library provides uuid_be type and uuid_be_to_bin() function.  This
substitutes open coded variant by generic library calls.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl_binary.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 10a1d7dc9313..6eb99c17dbd8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
 #include <linux/ctype.h>
 #include <linux/netdevice.h>
 #include <linux/kernel.h>
+#include <linux/uuid.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
 
@@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file,
 
 	/* Only supports reads */
 	if (oldval && oldlen) {
-		char buf[40], *str = buf;
-		unsigned char uuid[16];
-		int i;
+		char buf[UUID_STRING_LEN + 1];
+		uuid_be uuid;
 
 		result = kernel_read(file, 0, buf, sizeof(buf) - 1);
 		if (result < 0)
@@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file,
 
 		buf[result] = '\0';
 
-		/* Convert the uuid to from a string to binary */
-		for (i = 0; i < 16; i++) {
-			result = -EIO;
-			if (!isxdigit(str[0]) || !isxdigit(str[1]))
-				goto out;
-
-			uuid[i] = (hex_to_bin(str[0]) << 4) |
-					hex_to_bin(str[1]);
-			str += 2;
-			if (*str == '-')
-				str++;
-		}
+		result = -EIO;
+		if (uuid_be_to_bin(buf, &uuid))
+			goto out;
 
 		if (oldlen > 16)
 			oldlen = 16;
 
 		result = -EFAULT;
-		if (copy_to_user(oldval, uuid, oldlen))
+		if (copy_to_user(oldval, &uuid, oldlen))
 			goto out;
 
 		copied = oldlen;
-- 
cgit v1.2.3


From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:01:33 -0700
Subject: radix-tree: introduce radix_tree_empty

Commit e61452365372 ("radix_tree: add support for multi-order entries")
left the impression that the support for multiorder radix tree entries
was functional.  As soon as Ross tried to use it, it became apparent
that my testing was completely inadequate, and it didn't even work a
little bit for orders that were not a multiple of shift.

This series of patches is the result of about 6 weeks of redesign,
reimplementation, testing, arguing and hair-pulling.  The great news is
that the test-suite is now far better than it was.  That's reflected in
the diffstat for the test-suite alone:

 12 files changed, 436 insertions(+), 28 deletions(-)

The highlight for users of the tree is that the restriction on the order
of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix
tree now supports any order between 0 and 64.

For those who are interested in how the tree works, patch 9 is probably
the most interesting one as it introduces the new machinery for handling
sibling entries.

I've tried to be fair in attributing authorship to the person who
contributed the majority of the code in each patch; Ross has been an
invaluable partner in the development of this support and it's fair to
say that each of us has code in every commit.

I should also express my appreciation of the 0day testing.  It prompted
me that I was bloating the tinyconfig in an unacceptable way, and it
bisected to a commit which contained a rather nasty memory-corruption
bug.

This patch (of 29):

The irqdomain code was checking for 0 or 1 entries, not 0 entries like
the comment said they were.  Introduce a new helper that will actually
check for an empty tree.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/irqdomain.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d65f6f31a5b3..8798b6c9e945 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain)
 {
 	mutex_lock(&irq_domain_mutex);
 
-	/*
-	 * radix_tree_delete() takes care of destroying the root
-	 * node when all entries are removed. Shout if there are
-	 * any mappings left.
-	 */
-	WARN_ON(domain->revmap_tree.height);
+	WARN_ON(!radix_tree_empty(&domain->revmap_tree));
 
 	list_del(&domain->link);
 
-- 
cgit v1.2.3


From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 22 May 2016 17:21:27 -0700
Subject: x86: remove more uaccess_32.h complexity

I'm looking at trying to possibly merge the 32-bit and 64-bit versions
of the x86 uaccess.h implementation, but first this needs to be cleaned
up.

For example, the 32-bit version of "__copy_from_user_inatomic()" is
mostly the special cases for the constant size, and it's actually almost
never relevant.  Most users aren't actually using a constant size
anyway, and the few cases that do small constant copies are better off
just using __get_user() instead.

So get rid of the unnecessary complexity.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/events/uprobes.c | 3 +--
 kernel/futex.c          | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7edc95edfaee..c01f733ff2e1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	int result;
 
 	pagefault_disable();
-	result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
-							sizeof(opcode));
+	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
 	pagefault_enable();
 
 	if (likely(result == 0))
diff --git a/kernel/futex.c b/kernel/futex.c
index c20f06f38ef3..ee25f5ba4aca 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	int ret;
 
 	pagefault_disable();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+	ret = __get_user(*dest, from);
 	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
-- 
cgit v1.2.3


From 612bacad78ba6d0a91166fc4487af114bac172a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 22 May 2016 23:16:18 +0200
Subject: bpf, inode: disallow userns mounts

Follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not mount_ns
to mount the bpf filesystem"), which removes the FS_USERNS_MOUNT flag.

The original idea was to have a per mountns instance instead of a
single global fs instance, but that didn't work out and we had to
switch to mount_nodev() model. The intent of that middle ground was
that we avoid users who don't play nice to create endless instances
of bpf fs which are difficult to control and discover from an admin
point of view, but at the same time it would have allowed us to be
more flexible with regard to namespaces.

Therefore, since we now did the switch to mount_nodev() as a fix
where individual instances are created, we also need to remove userns
mount flag along with it to avoid running into mentioned situation.
I don't expect any breakage at this early point in time with removing
the flag and we can revisit this later should the requirement for
this come up with future users. This and commit e27f4a942a0e have
been split to facilitate tracking should any of them run into the
unlikely case of causing a regression.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 04be7021f848..318858edb1cd 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = {
 	.name		= "bpf",
 	.mount		= bpf_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 MODULE_ALIAS_FS("bpf");
-- 
cgit v1.2.3


From f43edca7ed08fc02279f2a62015da5cb6aa0ad61 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Mon, 23 May 2016 16:22:26 -0700
Subject: ELF/MIPS build fix

CONFIG_MIPS32_N32=y but CONFIG_BINFMT_ELF disabled results in the
following linker errors:

  arch/mips/built-in.o: In function `elf_core_dump':
  binfmt_elfn32.c:(.text+0x23dbc): undefined reference to `elf_core_extra_phdrs'
  binfmt_elfn32.c:(.text+0x246e4): undefined reference to `elf_core_extra_data_size'
  binfmt_elfn32.c:(.text+0x248d0): undefined reference to `elf_core_write_extra_phdrs'
  binfmt_elfn32.c:(.text+0x24ac4): undefined reference to `elf_core_write_extra_data'

CONFIG_MIPS32_O32=y but CONFIG_BINFMT_ELF disabled results in the following
linker errors:

  arch/mips/built-in.o: In function `elf_core_dump':
  binfmt_elfo32.c:(.text+0x28a04): undefined reference to `elf_core_extra_phdrs'
  binfmt_elfo32.c:(.text+0x29330): undefined reference to `elf_core_extra_data_size'
  binfmt_elfo32.c:(.text+0x2951c): undefined reference to `elf_core_write_extra_phdrs'
  binfmt_elfo32.c:(.text+0x29710): undefined reference to `elf_core_write_extra_data'

This is because binfmt_elfn32 and binfmt_elfo32 are using symbols from
elfcore but for these configurations elfcore will not be built.

Fixed by making elfcore selectable by a separate config symbol which
unlike the current mechanism can also be used from other directories
than kernel/, then having each flavor of ELF that relies on elfcore.o,
select it in Kconfig, including CONFIG_MIPS32_N32 and CONFIG_MIPS32_O32
which fixes this issue.

Link: http://lkml.kernel.org/r/20160520141705.GA1913@linux-mips.org
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Reviewed-by: James Hogan <james.hogan@imgtec.com>
Cc: "Maciej W. Rozycki" <macro@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index f0c40bf49d9f..e2ec54e2b952 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
+obj-$(CONFIG_ELFCORE) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_TRACE_CLOCK) += trace/
-- 
cgit v1.2.3


From bf959931ddb88c4e4366e96dd22e68fa0db9527c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 23 May 2016 16:23:50 -0700
Subject: wait/ptrace: assume __WALL if the child is traced

The following program (simplified version of generated by syzkaller)

	#include <pthread.h>
	#include <unistd.h>
	#include <sys/ptrace.h>
	#include <stdio.h>
	#include <signal.h>

	void *thread_func(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0,0,0);
		return 0;
	}

	int main(void)
	{
		pthread_t thread;

		if (fork())
			return 0;

		while (getppid() != 1)
			;

		pthread_create(&thread, NULL, thread_func, NULL);
		pthread_join(thread, NULL);
		return 0;
	}

creates an unreapable zombie if /sbin/init doesn't use __WALL.

This is not a kernel bug, at least in a sense that everything works as
expected: debugger should reap a traced sub-thread before it can reap the
leader, but without __WALL/__WCLONE do_wait() ignores sub-threads.

Unfortunately, it seems that /sbin/init in most (all?) distributions
doesn't use it and we have to change the kernel to avoid the problem.
Note also that most init's use sys_waitid() which doesn't allow __WALL, so
the necessary user-space fix is not that trivial.

This patch just adds the "ptrace" check into eligible_child().  To some
degree this matches the "tsk->ptrace" in exit_notify(), ->exit_signal is
mostly ignored when the tracee reports to debugger.  Or WSTOPPED, the
tracer doesn't need to set this flag to wait for the stopped tracee.

This obviously means the user-visible change: __WCLONE and __WALL no
longer have any meaning for debugger.  And I can only hope that this won't
break something, but at least strace/gdb won't suffer.

We could make a more conservative change.  Say, we can take __WCLONE into
account, or !thread_group_leader().  But it would be nice to not
complicate these historical/confusing checks.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 75b34fe835b2..44fbe6edd7fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 		task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
 		return 0;
-	/* Wait for all children (clone and not) if __WALL is set;
-	 * otherwise, wait for clone children *only* if __WCLONE is
-	 * set; otherwise, wait for non-clone children *only*.  (Note:
-	 * A "clone" child here is one that reports to its parent
-	 * using a signal other than SIGCHLD.) */
-	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
-	    && !(wo->wo_flags & __WALL))
+
+	/*
+	 * Wait for all children (clone and not) if __WALL is set or
+	 * if it is traced by us.
+	 */
+	if (ptrace || (wo->wo_flags & __WALL))
+		return 1;
+
+	/*
+	 * Otherwise, wait for clone children *only* if __WCLONE is set;
+	 * otherwise, wait for non-clone children *only*.
+	 *
+	 * Note: a "clone" child here is one that reports to its parent
+	 * using a signal other than SIGCHLD, or a non-leader thread which
+	 * we can only see if it is traced by us.
+	 */
+	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 		return 0;
 
 	return 1;
@@ -1300,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (unlikely(exit_state == EXIT_DEAD))
 		return 0;
 
-	ret = eligible_child(wo, p);
+	ret = eligible_child(wo, ptrace, p);
 	if (!ret)
 		return ret;
 
-- 
cgit v1.2.3


From 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 23 May 2016 16:23:53 -0700
Subject: wait: allow sys_waitid() to accept __WNOTHREAD/__WCLONE/__WALL

I see no reason why waitid() can't support other linux-specific flags
allowed in sys_wait4().

In particular this change can help if we reconsider the previous change
("wait/ptrace: assume __WALL if the child is traced") which adds the
"automagical" __WALL for debugger.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: <syzkaller@googlegroups.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 44fbe6edd7fe..9e6e1356e6bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1535,7 +1535,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	enum pid_type type;
 	long ret;
 
-	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
+			__WNOTHREAD|__WCLONE|__WALL))
 		return -EINVAL;
 	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
 		return -EINVAL;
-- 
cgit v1.2.3


From 747800efbe8b98459f48d1d9d742298f8283f8fa Mon Sep 17 00:00:00 2001
From: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Date: Mon, 23 May 2016 16:23:59 -0700
Subject: kernel/signal.c: convert printk(KERN_<LEVEL> ...) to pr_<level>(...)

Use pr_<level> instead of printk(KERN_<LEVEL> ).

Signed-off-by: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ab122a2cee41..96e9bc40667f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig)
 	if (!__ratelimit(&ratelimit_state))
 		return;
 
-	printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+	pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
 				current->comm, current->pid, sig);
 }
 
@@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 static void print_fatal_signal(int signr)
 {
 	struct pt_regs *regs = signal_pt_regs();
-	printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
+	pr_info("potentially unexpected fatal signal %d.\n", signr);
 
 #if defined(__i386__) && !defined(__arch_um__)
-	printk(KERN_INFO "code at %08lx: ", regs->ip);
+	pr_info("code at %08lx: ", regs->ip);
 	{
 		int i;
 		for (i = 0; i < 16; i++) {
@@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr)
 
 			if (get_user(insn, (unsigned char *)(regs->ip + i)))
 				break;
-			printk(KERN_CONT "%02x ", insn);
+			pr_cont("%02x ", insn);
 		}
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 #endif
 	preempt_disable();
 	show_regs(regs);
-- 
cgit v1.2.3


From 725fc629ff2545b061407305ae51016c9f928fce Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 23 May 2016 16:24:05 -0700
Subject: kernek/fork.c: allocate idle task for a CPU always on its local node

Linux preallocates the task structs of the idle tasks for all possible
CPUs.  This currently means they all end up on node 0.  This also
implies that the cache line of MWAIT, which is around the flags field in
the task struct, are all located in node 0.

We see a noticeable performance improvement on Knights Landing CPUs when
the cache lines used for MWAIT are located in the local nodes of the
CPUs using them.  I would expect this to give a (likely slight)
improvement on other systems too.

The patch implements placing the idle task in the node of its CPUs, by
passing the right target node to copy_process()

[akpm@linux-foundation.org: use NUMA_NO_NODE, not a bare -1]
Link: http://lkml.kernel.org/r/1463492694-15833-1-git-send-email-andi@firstfloor.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 103d78fd8f75..e67d7b773348 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -340,13 +340,14 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 }
 
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
-	int node = tsk_fork_get_node(orig);
 	int err;
 
+	if (node == NUMA_NO_NODE)
+		node = tsk_fork_get_node(orig);
 	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
@@ -1276,7 +1277,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls)
+					unsigned long tls,
+					int node)
 {
 	int retval;
 	struct task_struct *p;
@@ -1328,7 +1330,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	retval = -ENOMEM;
-	p = dup_task_struct(current);
+	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
 
@@ -1706,7 +1708,8 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+			    cpu_to_node(cpu));
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
 		init_idle(task, cpu);
@@ -1751,7 +1754,7 @@ long _do_fork(unsigned long clone_flags,
 	}
 
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace, tls);
+			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
-- 
cgit v1.2.3


From 9b492cf58077a0254eb4b9574029ac6e79add9f9 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 23 May 2016 16:24:10 -0700
Subject: kexec: introduce a protection mechanism for the crashkernel reserved
 memory

For the cases that some kernel (module) path stamps the crash reserved
memory(already mapped by the kernel) where has been loaded the second
kernel data, the kdump kernel will probably fail to boot when panic
happens (or even not happens) leaving the culprit at large, this is
unacceptable.

The patch introduces a mechanism for detecting such cases:

1) After each crash kexec loading, it simply marks the reserved memory
   regions readonly since we no longer access it after that.  When someone
   stamps the region, the first kernel will panic and trigger the kdump.
   The weak arch_kexec_protect_crashkres() is introduced to do the actual
   protection.

2) To allow multiple loading, once 1) was done we also need to remark
   the reserved memory to readwrite each time a system call related to
   kdump is made.  The weak arch_kexec_unprotect_crashkres() is introduced
   to do the actual protection.

The architecture can make its specific implementation by overriding
arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Minfei Huang <mhuang@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c      | 9 ++++++++-
 kernel/kexec_core.c | 6 ++++++
 kernel/kexec_file.c | 8 +++++++-
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index ee70aef5cd81..b44cb3f5a15c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		return -EBUSY;
 
 	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH)
+	if (flags & KEXEC_ON_CRASH) {
 		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	}
+
 	if (nr_segments > 0) {
 		unsigned long i;
 
@@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	image = xchg(dest_image, image);
 
 out:
+	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
 	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5d408252992..48b73cc8e425 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1563,3 +1563,9 @@ void __weak crash_map_reserved_pages(void)
 
 void __weak crash_unmap_reserved_pages(void)
 {}
+
+void __weak arch_kexec_protect_crashkres(void)
+{}
+
+void __weak arch_kexec_unprotect_crashkres(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c72d2ff5896e..503bc2d348e5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 		return -EBUSY;
 
 	dest_image = &kexec_image;
-	if (flags & KEXEC_FILE_ON_CRASH)
+	if (flags & KEXEC_FILE_ON_CRASH) {
 		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	}
 
 	if (flags & KEXEC_FILE_UNLOAD)
 		goto exchange;
@@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 exchange:
 	image = xchg(dest_image, image);
 out:
+	if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
 	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 	return ret;
-- 
cgit v1.2.3


From 917a35605f09c0d16aeb2e92c7fbff562e19a116 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnfhuang@gmail.com>
Date: Mon, 23 May 2016 16:24:16 -0700
Subject: kexec: make a pair of map/unmap reserved pages in error path

For some arch, kexec shall map the reserved pages, then use them, when
we try to start the kdump service.

kexec may return directly, without unmaping the reserved pages, if it
fails during starting service.  To fix it, we make a pair of map/unmap
reserved pages both in generic path and error path.

This patch only affects s390.  Other architecturess don't implement the
interface of crash_unmap_reserved_pages and crash_map_reserved_pages.

It isn't a urgent patch.  Kernel can work well without any risk,
although the reserved pages are not unmapped before returning in error
path.

Signed-off-by: Minfei Huang <mnfhuang@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Xunlei Pang <xlpang@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index b44cb3f5a15c..4b49aa71304f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -194,22 +194,25 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 						   segments, flags);
 		}
 		if (result)
-			goto out;
+			goto unmap_page;
 
 		if (flags & KEXEC_PRESERVE_CONTEXT)
 			image->preserve_context = 1;
 		result = machine_kexec_prepare(image);
 		if (result)
-			goto out;
+			goto unmap_page;
 
 		for (i = 0; i < nr_segments; i++) {
 			result = kimage_load_segment(image, &image->segment[i]);
 			if (result)
-				goto out;
+				goto unmap_page;
 		}
 		kimage_terminate(image);
+unmap_page:
 		if (flags & KEXEC_ON_CRASH)
 			crash_unmap_reserved_pages();
+		if (result)
+			goto out;
 	}
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
-- 
cgit v1.2.3


From 0eea08678ebe9f7d8ef98fed974a5bf1a0dd2dd2 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnfhuang@gmail.com>
Date: Mon, 23 May 2016 16:24:19 -0700
Subject: kexec: do a cleanup for function kexec_load

There are a lof of work to be done in function kexec_load, not only for
allocating structs and loading initram, but also for some misc.

To make it more clear, wrap a new function do_kexec_load which is used
to allocate structs and load initram.  And the pre-work will be done in
kexec_load.

Signed-off-by: Minfei Huang <mnfhuang@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Xunlei Pang <xlpang@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 125 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 69 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b49aa71304f..b73dc211fcfd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -103,6 +103,74 @@ out_free_image:
 	return ret;
 }
 
+static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
+		struct kexec_segment __user *segments, unsigned long flags)
+{
+	struct kimage **dest_image, *image;
+	unsigned long i;
+	int ret;
+
+	if (flags & KEXEC_ON_CRASH) {
+		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	} else {
+		dest_image = &kexec_image;
+	}
+
+	if (nr_segments == 0) {
+		/* Uninstall image */
+		kimage_free(xchg(dest_image, NULL));
+		return 0;
+	}
+	if (flags & KEXEC_ON_CRASH) {
+		/*
+		 * Loading another kernel to switch to if this one
+		 * crashes.  Free any current crash dump kernel before
+		 * we corrupt it.
+		 */
+		kimage_free(xchg(&kexec_crash_image, NULL));
+	}
+
+	ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
+	if (ret)
+		return ret;
+
+	if (flags & KEXEC_ON_CRASH)
+		crash_map_reserved_pages();
+
+	if (flags & KEXEC_PRESERVE_CONTEXT)
+		image->preserve_context = 1;
+
+	ret = machine_kexec_prepare(image);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < nr_segments; i++) {
+		ret = kimage_load_segment(image, &image->segment[i]);
+		if (ret)
+			goto out;
+	}
+
+	kimage_terminate(image);
+
+	/* Install the new kernel and uninstall the old */
+	image = xchg(dest_image, image);
+
+out:
+	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
+	/*
+	 * Once the reserved memory is mapped, we should unmap this memory
+	 * before returning
+	 */
+	if (flags & KEXEC_ON_CRASH)
+		crash_unmap_reserved_pages();
+	kimage_free(image);
+	return ret;
+}
+
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
  *
@@ -127,7 +195,6 @@ out_free_image:
 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		struct kexec_segment __user *, segments, unsigned long, flags)
 {
-	struct kimage **dest_image, *image;
 	int result;
 
 	/* We only trust the superuser with rebooting the system. */
@@ -152,9 +219,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if (nr_segments > KEXEC_SEGMENT_MAX)
 		return -EINVAL;
 
-	image = NULL;
-	result = 0;
-
 	/* Because we write directly to the reserved memory
 	 * region when loading crash kernels we need a mutex here to
 	 * prevent multiple crash  kernels from attempting to load
@@ -166,63 +230,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if (!mutex_trylock(&kexec_mutex))
 		return -EBUSY;
 
-	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH) {
-		dest_image = &kexec_crash_image;
-		if (kexec_crash_image)
-			arch_kexec_unprotect_crashkres();
-	}
-
-	if (nr_segments > 0) {
-		unsigned long i;
-
-		if (flags & KEXEC_ON_CRASH) {
-			/*
-			 * Loading another kernel to switch to if this one
-			 * crashes.  Free any current crash dump kernel before
-			 * we corrupt it.
-			 */
-
-			kimage_free(xchg(&kexec_crash_image, NULL));
-			result = kimage_alloc_init(&image, entry, nr_segments,
-						   segments, flags);
-			crash_map_reserved_pages();
-		} else {
-			/* Loading another kernel to reboot into. */
-
-			result = kimage_alloc_init(&image, entry, nr_segments,
-						   segments, flags);
-		}
-		if (result)
-			goto unmap_page;
-
-		if (flags & KEXEC_PRESERVE_CONTEXT)
-			image->preserve_context = 1;
-		result = machine_kexec_prepare(image);
-		if (result)
-			goto unmap_page;
-
-		for (i = 0; i < nr_segments; i++) {
-			result = kimage_load_segment(image, &image->segment[i]);
-			if (result)
-				goto unmap_page;
-		}
-		kimage_terminate(image);
-unmap_page:
-		if (flags & KEXEC_ON_CRASH)
-			crash_unmap_reserved_pages();
-		if (result)
-			goto out;
-	}
-	/* Install the new kernel, and  Uninstall the old */
-	image = xchg(dest_image, image);
+	result = do_kexec_load(entry, nr_segments, segments, flags);
 
-out:
 	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
 
 	mutex_unlock(&kexec_mutex);
-	kimage_free(image);
 
 	return result;
 }
-- 
cgit v1.2.3


From 7a0058ec78602da02b34fa2ae3afc523e90d1ab2 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 23 May 2016 16:24:22 -0700
Subject: s390/kexec: consolidate crash_map/unmap_reserved_pages() and
 arch_kexec_protect(unprotect)_crashkres()

Commit 3f625002581b ("kexec: introduce a protection mechanism for the
crashkernel reserved memory") is a similar mechanism for protecting the
crash kernel reserved memory to previous crash_map/unmap_reserved_pages()
implementation, the new one is more generic in name and cleaner in code
(besides, some arch may not be allowed to unmap the pgtable).

Therefore, this patch consolidates them, and uses the new
arch_kexec_protect(unprotect)_crashkres() to replace former
crash_map/unmap_reserved_pages() which by now has been only used by
S390.

The consolidation work needs the crash memory to be mapped initially,
this is done in machine_kdump_pm_init() which is after
reserve_crashkernel().  Once kdump kernel is loaded, the new
arch_kexec_protect_crashkres() implemented for S390 will actually
unmap the pgtable like before.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Minfei Huang <mhuang@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c      | 12 ------------
 kernel/kexec_core.c | 11 ++---------
 2 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index b73dc211fcfd..4384672d3245 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -136,9 +136,6 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	if (ret)
 		return ret;
 
-	if (flags & KEXEC_ON_CRASH)
-		crash_map_reserved_pages();
-
 	if (flags & KEXEC_PRESERVE_CONTEXT)
 		image->preserve_context = 1;
 
@@ -161,12 +158,6 @@ out:
 	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
 
-	/*
-	 * Once the reserved memory is mapped, we should unmap this memory
-	 * before returning
-	 */
-	if (flags & KEXEC_ON_CRASH)
-		crash_unmap_reserved_pages();
 	kimage_free(image);
 	return ret;
 }
@@ -232,9 +223,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 
 	result = do_kexec_load(entry, nr_segments, segments, flags);
 
-	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
-		arch_kexec_protect_crashkres();
-
 	mutex_unlock(&kexec_mutex);
 
 	return result;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 48b73cc8e425..56b3ed0927b0 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -954,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size)
 	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
 	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
 
-	crash_map_reserved_pages();
 	crash_free_reserved_phys_range(end, crashk_res.end);
 
 	if ((start == end) && (crashk_res.parent != NULL))
@@ -968,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size)
 	crashk_res.end = end - 1;
 
 	insert_resource(&iomem_resource, ram_res);
-	crash_unmap_reserved_pages();
 
 unlock:
 	mutex_unlock(&kexec_mutex);
@@ -1553,17 +1551,12 @@ int kernel_kexec(void)
 }
 
 /*
- * Add and remove page tables for crashkernel memory
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
  *
  * Provide an empty default implementation here -- architecture
  * code may override this
  */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
 void __weak arch_kexec_protect_crashkres(void)
 {}
 
-- 
cgit v1.2.3


From 7c051267931a9be9c6620cc17b362bc6ee6dedc8 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:48 -0700
Subject: mm, fork: make dup_mmap wait for mmap_sem for write killable

dup_mmap needs to lock current's mm mmap_sem for write.  If the waiting
task gets killed by the oom killer it would block oom_reaper from
asynchronous address space reclaim and reduce the chances of timely OOM
resolving.  Wait for the lock in the killable mode and return with EINTR
if the task got killed while waiting.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index e67d7b773348..47887bba944f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -414,7 +414,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	unsigned long charge;
 
 	uprobe_start_dup_mmap();
-	down_write(&oldmm->mmap_sem);
+	if (down_write_killable(&oldmm->mmap_sem)) {
+		retval = -EINTR;
+		goto fail_uprobe_end;
+	}
 	flush_cache_dup_mm(oldmm);
 	uprobe_dup_mmap(oldmm, mm);
 	/*
@@ -526,6 +529,7 @@ out:
 	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
+fail_uprobe_end:
 	uprobe_end_dup_mmap();
 	return retval;
 fail_nomem_anon_vma_fork:
-- 
cgit v1.2.3


From 17b0573d77fbd547cbf1d711b90d269bdcc1efd3 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:26:05 -0700
Subject: prctl: make PR_SET_THP_DISABLE wait for mmap_sem killable

PR_SET_THP_DISABLE requires mmap_sem for write.  If the waiting task
gets killed by the oom killer it would block oom_reaper from
asynchronous address space reclaim and reduce the chances of timely OOM
resolving.  Wait for the lock in the killable mode and return with EINTR
if the task got killed while waiting.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Alex Thorlton <athorlton@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index cf8ba545c7d3..89d5be418157 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2246,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_THP_DISABLE:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
-		down_write(&me->mm->mmap_sem);
+		if (down_write_killable(&me->mm->mmap_sem))
+			return -EINTR;
 		if (arg2)
 			me->mm->def_flags |= VM_NOHUGEPAGE;
 		else
-- 
cgit v1.2.3


From 598fdc1d66674264e122ca9d007ad822e98d8b8d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:26:08 -0700
Subject: uprobes: wait for mmap_sem for write killable

xol_add_vma needs mmap_sem for write.  If the waiting task gets killed
by the oom killer it would block oom_reaper from asynchronous address
space reclaim and reduce the chances of timely OOM resolving.  Wait for
the lock in the killable mode and return with EINTR if the task got
killed while waiting.

Do not warn in dup_xol_work if __create_xol_area failed due to fatal
signal pending because this is usually considered a kernel issue.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/events/uprobes.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c01f733ff2e1..b7a525ab2083 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1130,7 +1130,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 	struct vm_area_struct *vma;
 	int ret;
 
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+
 	if (mm->uprobes_state.xol_area) {
 		ret = -EALREADY;
 		goto fail;
@@ -1469,7 +1471,8 @@ static void dup_xol_work(struct callback_head *work)
 	if (current->flags & PF_EXITING)
 		return;
 
-	if (!__create_xol_area(current->utask->dup_xol_addr))
+	if (!__create_xol_area(current->utask->dup_xol_addr) &&
+			!fatal_signal_pending(current))
 		uprobe_warn(current, "dup xol area");
 }
 
-- 
cgit v1.2.3


From 59fa5860204ffc95128d60cba9f54f9740a42c7d Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Tue, 24 May 2016 11:42:30 +0100
Subject: genirq: Fix missing return value in irq_destroy_ipi()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 7cec18a3906b changed the return type of irq_destroy_ipi to int, but
missed adding a value to one return statement. Fix this to silence the
resulting compiler warning:

kernel/irq/ipi.c In function ‘irq_destroy_ipi’:
kernel/irq/ipi.c:128:3: warning: ‘return’ with no value, in function returning non-void [-Wreturn-type]

Fixes: 7cec18a3906b "genirq: Add error code reporting to irq_{reserve,destroy}_ipi"
Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/1464086550-24734-1-git-send-email-matt.redfearn@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/ipi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index c42742208e5e..89b49f6773f0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -125,7 +125,7 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 
 	domain = data->domain;
 	if (WARN_ON(domain == NULL))
-		return;
+		return -EINVAL;
 
 	if (!irq_domain_is_ipi(domain)) {
 		pr_warn("Trying to destroy a non IPI domain!\n");
-- 
cgit v1.2.3


From b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 May 2016 11:19:07 +0200
Subject: sched/core: Fix remote wakeups

Commit:

  b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration")

... introduced a bug: Mike Galbraith found that it introduced a
performance regression, while Paul E. McKenney reported lost
wakeups and bisected it to this commit.

The reason is that I mis-read ttwu_queue() such that I assumed any
wakeup that got a remote queue must have had the task migrated.

Since this is not so; we need to transfer this information between
queueing the wakeup and actually doing the wakeup. Use a new
task_struct::sched_flag for this, we already write to
sched_contributes_to_load in the wakeup path so this is a hot and
modified cacheline.

Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ben Segall <bsegall@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Turner <pjt@google.com>
Cc: Pavan Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration")
Link: http://lkml.kernel.org/r/20160523091907.GD15728@worktop.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 404c0784b1fc..7f2cae4620c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void)
 	cookie = lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
+		int wake_flags = 0;
+
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		/*
-		 * See ttwu_queue(); we only call ttwu_queue_remote() when
-		 * its a x-cpu wakeup.
-		 */
-		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+		if (p->sched_remote_wakeup)
+			wake_flags = WF_MIGRATED;
+
+		ttwu_do_activate(rq, p, wake_flags, cookie);
 	}
 
 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1821,12 @@ void scheduler_ipi(void)
 	irq_exit();
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 
+	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
 		if (!set_nr_if_polling(rq->idle))
 			smp_send_reschedule(cpu);
@@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
-		ttwu_queue_remote(p, cpu);
+		ttwu_queue_remote(p, cpu, wake_flags);
 		return;
 	}
 #endif
-- 
cgit v1.2.3


From 887bddfa90c79957d61067cd54a10087be0c8b23 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 26 May 2016 00:04:58 -0400
Subject: add down_write_killable_nested()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/locking/rwsem.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c817216c1615..2e853ad93a3a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_write_nested);
 
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+	might_sleep();
+	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		return -EINTR;
+	}
+
+	rwsem_set_owner(sem);
+	return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable_nested);
+
 void up_read_non_owner(struct rw_semaphore *sem)
 {
 	__up_read(sem);
-- 
cgit v1.2.3


From 7ef949d77f95f0d129f0d404b336459a34a00101 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 26 May 2016 15:16:22 -0700
Subject: mm: oom_reaper: remove some bloat

mmput_async is currently used only from the oom_reaper which is defined
only for CONFIG_MMU.  We can save work_struct in mm_struct for
!CONFIG_MMU.

[akpm@linux-foundation.org: fix typo, per Minchan]
Link: http://lkml.kernel.org/r/20160520061658.GB19172@dhcp22.suse.cz
Reported-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 47887bba944f..5c2c355aa97f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
 static void mmput_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
@@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm)
 		schedule_work(&mm->async_put_work);
 	}
 }
+#endif
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
-- 
cgit v1.2.3


From 287980e49ffc0f6d911601e7e352a812ed27768e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 27 May 2016 23:23:25 +0200
Subject: remove lots of IS_ERR_VALUE abuses

Most users of IS_ERR_VALUE() in the kernel are wrong, as they
pass an 'int' into a function that takes an 'unsigned long'
argument. This happens to work because the type is sign-extended
on 64-bit architectures before it gets converted into an
unsigned type.

However, anything that passes an 'unsigned short' or 'unsigned int'
argument into IS_ERR_VALUE() is guaranteed to be broken, as are
8-bit integers and types that are wider than 'unsigned long'.

Andrzej Hajda has already fixed a lot of the worst abusers that
were causing actual bugs, but it would be nice to prevent any
users that are not passing 'unsigned long' arguments.

This patch changes all users of IS_ERR_VALUE() that I could find
on 32-bit ARM randconfig builds and x86 allmodconfig. For the
moment, this doesn't change the definition of IS_ERR_VALUE()
because there are probably still architecture specific users
elsewhere.

Almost all the warnings I got are for files that are better off
using 'if (err)' or 'if (err < 0)'.
The only legitimate user I could find that we get a warning for
is the (32-bit only) freescale fman driver, so I did not remove
the IS_ERR_VALUE() there but changed the type to 'unsigned long'.
For 9pfs, I just worked around one user whose calling conventions
are so obscure that I did not dare change the behavior.

I was using this definition for testing:

 #define IS_ERR_VALUE(x) ((unsigned long*)NULL == (typeof (x)*)NULL && \
       unlikely((unsigned long long)(x) >= (unsigned long long)(typeof(x))-MAX_ERRNO))

which ends up making all 16-bit or wider types work correctly with
the most plausible interpretation of what IS_ERR_VALUE() was supposed
to return according to its users, but also causes a compile-time
warning for any users that do not pass an 'unsigned long' argument.

I suggested this approach earlier this year, but back then we ended
up deciding to just fix the users that are obviously broken. After
the initial warning that caused me to get involved in the discussion
(fs/gfs2/dir.c) showed up again in the mainline kernel, Linus
asked me to send the whole thing again.

[ Updated the 9p parts as per Al Viro  - Linus ]

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andrzej Hajda <a.hajda@samsung.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lkml.org/lkml/2016/1/7/363
Link: https://lkml.org/lkml/2016/5/27/486
Acked-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org> # For nvmem part
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 4d73a834c7e6..f66162f2359b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	pid->level = ns->level;
 	for (i = ns->level; i >= 0; i--) {
 		nr = alloc_pidmap(tmp);
-		if (IS_ERR_VALUE(nr)) {
+		if (nr < 0) {
 			retval = nr;
 			goto out_free;
 		}
-- 
cgit v1.2.3


From c08376ac97cb202ec65320f3d90d5c4c5e2adb0b Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 26 May 2016 17:21:05 -0700
Subject: timer: Export destroy_hrtimer_on_stack()

hrtimer_init_on_stack() needs a matching call to
destroy_hrtimer_on_stack(), so both need to be exported.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/time/hrtimer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8c7392c4fdbd..e99df0ff1d42 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
 	debug_object_free(timer, &hrtimer_debug_descr);
 }
+EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
 
 #else
 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
-- 
cgit v1.2.3


From 9bd616e3dbedfc103f158197c8ad93678849b1ed Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 1 Jun 2016 18:52:16 +0100
Subject: cpuidle: Do not access cpuidle_devices when !CONFIG_CPU_IDLE

The cpuidle_devices per-CPU variable is only defined when CPU_IDLE is
enabled. Commit c8cc7d4de7a4 ("sched/idle: Reorganize the idle loop")
removed the #ifdef CONFIG_CPU_IDLE around cpuidle_idle_call() with the
compiler optimising away __this_cpu_read(cpuidle_devices). However, with
CONFIG_UBSAN && !CONFIG_CPU_IDLE, this optimisation no longer happens
and the kernel fails to link since cpuidle_devices is not defined.

This patch introduces an accessor function for the current CPU cpuidle
device (returning NULL when !CONFIG_CPU_IDLE) and uses it in
cpuidle_idle_call().

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: 4.5+ <stable@vger.kernel.org> # 4.5+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index bd12c6c714ec..c5aeedf4e93a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  */
 static void cpuidle_idle_call(void)
 {
-	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_device *dev = cpuidle_get_device();
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
 	int next_state, entered_state;
 
-- 
cgit v1.2.3


From 0422e83d84ae24b933e4b0d4c1e0f0b4ae8a0a3b Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 26 May 2016 21:08:17 +0100
Subject: locking/ww_mutex: Report recursive ww_mutex locking early

Recursive locking for ww_mutexes was originally conceived as an
exception. However, it is heavily used by the DRM atomic modesetting
code. Currently, the recursive deadlock is checked after we have queued
up for a busy-spin and as we never release the lock, we spin until
kicked, whereupon the deadlock is discovered and reported.

A simple solution for the now common problem is to move the recursive
deadlock discovery to the first action when taking the ww_mutex.

Suggested-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1464293297-19777-1-git-send-email-chris@chris-wilson.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mutex.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e364b424b019..79d2d765a75f 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 	if (!hold_ctx)
 		return 0;
 
-	if (unlikely(ctx == hold_ctx))
-		return -EALREADY;
-
 	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
 	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	unsigned long flags;
 	int ret;
 
+	if (use_ww_ctx) {
+		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+			return -EALREADY;
+	}
+
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-- 
cgit v1.2.3


From 5b6c1b4d46b0dae4edea636a776d09f2064f4cd7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 4 Jun 2016 20:50:59 +0200
Subject: bpf, trace: use READ_ONCE for retrieving file ptr

In bpf_perf_event_read() and bpf_perf_event_output(), we must use
READ_ONCE() for fetching the struct file pointer, which could get
updated concurrently, so we must prevent the compiler from potential
refetching.

We already do this with tail calls for fetching the related bpf_prog,
but not so on stored perf events. Semantics for both are the same
with regards to updates.

Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 780bcbe1d4de..720b7bb01d43 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -198,7 +198,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = (struct file *)array->ptrs[index];
+	file = READ_ONCE(array->ptrs[index]);
 	if (unlikely(!file))
 		return -ENOENT;
 
@@ -247,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = (struct file *)array->ptrs[index];
+	file = READ_ONCE(array->ptrs[index]);
 	if (unlikely(!file))
 		return -ENOENT;
 
-- 
cgit v1.2.3


From 2c610022711675ee908b903d242f0b90e1db661f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Jun 2016 10:19:51 +0200
Subject: locking/qspinlock: Fix spin_unlock_wait() some more

While this prior commit:

  54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()")

... fixes spin_is_locked() and spin_unlock_wait() for the usage
in ipc/sem and netfilter, it does not in fact work right for the
usage in task_work and futex.

So while the 2 locks crossed problem:

	spin_lock(A)		spin_lock(B)
	if (!spin_is_locked(B)) spin_unlock_wait(A)
	  foo()			foo();

... works with the smp_mb() injected by both spin_is_locked() and
spin_unlock_wait(), this is not sufficient for:

	flag = 1;
	smp_mb();		spin_lock()
	spin_unlock_wait()	if (!flag)
				  // add to lockless list
	// iterate lockless list

... because in this scenario, the store from spin_lock() can be delayed
past the load of flag, uncrossing the variables and loosing the
guarantee.

This patch reworks spin_is_locked() and spin_unlock_wait() to work in
both cases by exploiting the observation that while the lock byte
store can be delayed, the contender must have registered itself
visibly in other state contained in the word.

It also allows for architectures to override both functions, as PPC
and ARM64 have an additional issue for which we currently have no
generic solution.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Giovanni Gherdovich <ggherdovich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <waiman.long@hpe.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: stable@vger.kernel.org # v4.2 and later
Fixes: 54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..5fc8c311b8fe 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -267,6 +267,66 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
+/*
+ * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
+ * issuing an _unordered_ store to set _Q_LOCKED_VAL.
+ *
+ * This means that the store can be delayed, but no later than the
+ * store-release from the unlock. This means that simply observing
+ * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
+ *
+ * There are two paths that can issue the unordered store:
+ *
+ *  (1) clear_pending_set_locked():	*,1,0 -> *,0,1
+ *
+ *  (2) set_locked():			t,0,0 -> t,0,1 ; t != 0
+ *      atomic_cmpxchg_relaxed():	t,0,0 -> 0,0,1
+ *
+ * However, in both cases we have other !0 state we've set before to queue
+ * ourseves:
+ *
+ * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
+ * load is constrained by that ACQUIRE to not pass before that, and thus must
+ * observe the store.
+ *
+ * For (2) we have a more intersting scenario. We enqueue ourselves using
+ * xchg_tail(), which ends up being a RELEASE. This in itself is not
+ * sufficient, however that is followed by an smp_cond_acquire() on the same
+ * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
+ * guarantees we must observe that store.
+ *
+ * Therefore both cases have other !0 state that is observable before the
+ * unordered locked byte store comes through. This means we can use that to
+ * wait for the lock store, and then wait for an unlock.
+ */
+#ifndef queued_spin_unlock_wait
+void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+	u32 val;
+
+	for (;;) {
+		val = atomic_read(&lock->val);
+
+		if (!val) /* not locked, we're done */
+			goto done;
+
+		if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
+			break;
+
+		/* not locked, but pending, wait until we observe the lock */
+		cpu_relax();
+	}
+
+	/* any unlock is good */
+	while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+		cpu_relax();
+
+done:
+	smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+}
+EXPORT_SYMBOL(queued_spin_unlock_wait);
+#endif
+
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
 /**
-- 
cgit v1.2.3


From 62a92c8f553e49270a0ee391b8733da71ab0aebc Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 7 Jun 2016 15:44:15 +0300
Subject: perf/core: Remove a redundant check

There is no way to end up in _free_event() with event::pmu being NULL.
The latter is initialized in event allocation path and remains set
forever. In case of allocation failure, the error path doesn't use
_free_event().

Having the check, however, suggests that it is possible to have a
event::pmu==NULL situation in _free_event() and confuses the robots.

This patch gets rid of the check.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1465303455-26032-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 050a290c72c7..87e945d6ebb8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3862,10 +3862,8 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	if (event->pmu) {
-		exclusive_event_destroy(event);
-		module_put(event->pmu->module);
-	}
+	exclusive_event_destroy(event);
+	module_put(event->pmu->module);
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
-- 
cgit v1.2.3


From 9c57259117b9c25472a3fa6d5a14d6bb3b647e87 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 3 Jun 2016 17:58:40 -0500
Subject: sched/debug: Fix /proc/sched_debug regression

Commit:

  cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")

... introduced a bug when CONFIG_SCHEDSTATS is enabled and the
runtime tunable is disabled (which is the default).

The wait-time, sum-exec, and sum-sleep fields are missing from the
/proc/sched_debug file in the runnable_tasks section.

Fix it with a new schedstat_val() macro which returns the field value
when schedstats is enabled and zero otherwise.  The macro works with
both SCHEDSTATS and !SCHEDSTATS.  I put the macro in stats.h since it
might end up being useful in other places.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Link: http://lkml.kernel.org/r/bcda7c2790cf2ccbe586a28c02dd7b6fe7749a2b.1464994423.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 15 ++++-----------
 kernel/sched/stats.h |  3 +++
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cf905f655ba1..0368c393a336 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		SPLIT_NS(p->se.vruntime),
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
-#ifdef CONFIG_SCHEDSTATS
-	if (schedstat_enabled()) {
-		SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-			SPLIT_NS(p->se.statistics.wait_sum),
-			SPLIT_NS(p->se.sum_exec_runtime),
-			SPLIT_NS(p->se.statistics.sum_sleep_runtime));
-	}
-#else
+
 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		0LL, 0L,
+		SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
 		SPLIT_NS(p->se.sum_exec_runtime),
-		0LL, 0L);
-#endif
+		SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+
 #ifdef CONFIG_NUMA_BALANCING
 	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 70b3b6a20fb0..78955cbea31c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 # define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
 # define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
 # define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
+# define schedstat_val(rq, field)	((schedstat_enabled()) ? (rq)->field : 0)
+
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_inc(rq, field)	do { } while (0)
 # define schedstat_add(rq, field, amt)	do { } while (0)
 # define schedstat_set(var, val)	do { } while (0)
+# define schedstat_val(rq, field)	0
 #endif
 
 #ifdef CONFIG_SCHED_INFO
-- 
cgit v1.2.3


From 4698f88c06b893f2acc0b443004a53bf490fde7c Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 7 Jun 2016 14:43:16 -0500
Subject: sched/debug: Fix 'schedstats=enable' cmdline option

The 'schedstats=enable' option doesn't work, and also produces the
following warning during boot:

  WARNING: CPU: 0 PID: 0 at /home/jpoimboe/git/linux/kernel/jump_label.c:61 static_key_slow_inc+0x8c/0xa0
  static_key_slow_inc used before call to jump_label_init
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.7.0-rc1+ #25
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014
   0000000000000086 3ae3475a4bea95d4 ffffffff81e03da8 ffffffff8143fc83
   ffffffff81e03df8 0000000000000000 ffffffff81e03de8 ffffffff810b1ffb
   0000003d00000096 ffffffff823514d0 ffff88007ff197c8 0000000000000000
  Call Trace:
   [<ffffffff8143fc83>] dump_stack+0x85/0xc2
   [<ffffffff810b1ffb>] __warn+0xcb/0xf0
   [<ffffffff810b207f>] warn_slowpath_fmt+0x5f/0x80
   [<ffffffff811e9c0c>] static_key_slow_inc+0x8c/0xa0
   [<ffffffff810e07c6>] static_key_enable+0x16/0x40
   [<ffffffff8216d633>] setup_schedstats+0x29/0x94
   [<ffffffff82148a05>] unknown_bootoption+0x89/0x191
   [<ffffffff810d8617>] parse_args+0x297/0x4b0
   [<ffffffff82148d61>] start_kernel+0x1d8/0x4a9
   [<ffffffff8214897c>] ? set_init_arg+0x55/0x55
   [<ffffffff82148120>] ? early_idt_handler_array+0x120/0x120
   [<ffffffff821482db>] x86_64_start_reservations+0x2f/0x31
   [<ffffffff82148427>] x86_64_start_kernel+0x14a/0x16d

The problem is that it tries to update the 'sched_schedstats' static key
before jump labels have been initialized.

Changing jump_label_init() to be called earlier before
parse_early_param() wouldn't fix it: it would still fail trying to
poke_text() because mm isn't yet initialized.

Instead, just create a temporary '__sched_schedstats' variable which can
be copied to the static key later during sched_init() after jump labels
have been initialized.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Link: http://lkml.kernel.org/r/453775fe3433bed65731a583e228ccea806d18cd.1465322027.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4620c7..385c947482e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,9 +2253,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #endif
 #endif
 
+#ifdef CONFIG_SCHEDSTATS
+
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
 
-#ifdef CONFIG_SCHEDSTATS
 static void set_schedstats(bool enabled)
 {
 	if (enabled)
@@ -2278,11 +2280,16 @@ static int __init setup_schedstats(char *str)
 	if (!str)
 		goto out;
 
+	/*
+	 * This code is called before jump labels have been set up, so we can't
+	 * change the static branch directly just yet.  Instead set a temporary
+	 * variable so init_schedstats() can do it later.
+	 */
 	if (!strcmp(str, "enable")) {
-		set_schedstats(true);
+		__sched_schedstats = true;
 		ret = 1;
 	} else if (!strcmp(str, "disable")) {
-		set_schedstats(false);
+		__sched_schedstats = false;
 		ret = 1;
 	}
 out:
@@ -2293,6 +2300,11 @@ out:
 }
 __setup("schedstats=", setup_schedstats);
 
+static void __init init_schedstats(void)
+{
+	set_schedstats(__sched_schedstats);
+}
+
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_schedstats(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2313,8 +2325,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
 		set_schedstats(state);
 	return err;
 }
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#else  /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
 
 /*
  * fork()/clone()-time setup:
@@ -7487,6 +7501,8 @@ void __init sched_init(void)
 #endif
 	init_sched_fair_class();
 
+	init_schedstats();
+
 	scheduler_running = 1;
 }
 
-- 
cgit v1.2.3


From 077fa7aed17de5022e44bf07dbaf732078b7b5b2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 8 Jun 2016 14:25:22 +0100
Subject: futex: Calculate the futex key based on a tail page for file-based
 futexes

Mike Galbraith reported that the LTP test case futex_wake04 was broken
by commit 65d8fc777f6d ("futex: Remove requirement for lock_page()
in get_futex_key()").

This test case uses futexes backed by hugetlbfs pages and so there is an
associated inode with a futex stored on such pages. The problem is that
the key is being calculated based on the head page index of the hugetlbfs
page and not the tail page.

Prior to the optimisation, the page lock was used to stabilise mappings and
pin the inode is file-backed which is overkill. If the page was a compound
page, the head page was automatically looked up as part of the page lock
operation but the tail page index was used to calculate the futex key.

After the optimisation, the compound head is looked up early and the page
lock is only relied upon to identify truncated pages, special pages or a
shmem page moving to swapcache. The head page is looked up because without
the page lock, special care has to be taken to pin the inode correctly.
However, the tail page is still required to calculate the futex key so
this patch records the tail page.

On vanilla 4.6, the output of the test case is;

futex_wake04    0  TINFO  :  Hugepagesize 2097152
futex_wake04    1  TFAIL  :  futex_wake04.c:126: Bug: wait_thread2 did not wake after 30 secs.

With the patch applied

futex_wake04    0  TINFO  :  Hugepagesize 2097152
futex_wake04    1  TPASS  :  Hi hydra, thread2 awake!

Fixes: 65d8fc777f6d "futex: Remove requirement for lock_page() in get_futex_key()"
Reported-and-tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/20160608132522.GM2469@suse.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index ee25f5ba4aca..33664f70e2d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *tail;
 	struct address_space *mapping;
 	int err, ro = 0;
 
@@ -530,7 +530,15 @@ again:
 	 * considered here and page lock forces unnecessarily serialization
 	 * From this point on, mapping will be re-verified if necessary and
 	 * page lock will be acquired only if it is unavoidable
-	 */
+	 *
+	 * Mapping checks require the head page for any compound page so the
+	 * head page and mapping is looked up now. For anonymous pages, it
+	 * does not matter if the page splits in the future as the key is
+	 * based on the address. For filesystem-backed pages, the tail is
+	 * required as the index of the page determines the key. For
+	 * base pages, there is no tail page and tail == page.
+	 */
+	tail = page;
 	page = compound_head(page);
 	mapping = READ_ONCE(page->mapping);
 
@@ -654,7 +662,7 @@ again:
 
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.inode = inode;
-		key->shared.pgoff = basepage_index(page);
+		key->shared.pgoff = basepage_index(tail);
 		rcu_read_unlock();
 	}
 
-- 
cgit v1.2.3


From ba62bafe942b159a6109cbec780d36496e06b6c5 Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou <yizhouzhou@ict.ac.cn>
Date: Wed, 8 Jun 2016 15:33:53 -0700
Subject: kernel/relay.c: fix potential memory leak

When relay_open_buf() fails in relay_open(), code will goto free_bufs,
but chan is nowhere freed.

Link: http://lkml.kernel.org/r/1464777927-19675-1-git-send-email-yizhouzhou@ict.ac.cn
Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 074994bcfa9b..04d7cf3ef8cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -614,6 +614,7 @@ free_bufs:
 
 	kref_put(&chan->kref, relay_destroy_channel);
 	mutex_unlock(&relay_channels_mutex);
+	kfree(chan);
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(relay_open);
-- 
cgit v1.2.3


From 29d6455178a09e1dc340380c582b13356227e8df Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 1 Jun 2016 11:55:07 +0200
Subject: sched: panic on corrupted stack end

Until now, hitting this BUG_ON caused a recursive oops (because oops
handling involves do_exit(), which calls into the scheduler, which in
turn raises an oops), which caused stuff below the stack to be
overwritten until a panic happened (e.g.  via an oops in interrupt
context, caused by the overwritten CPU index in the thread_info).

Just panic directly.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d1f7149f8704..11546a6ed5df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3047,7 +3047,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-	BUG_ON(task_stack_end_corrupted(prev));
+	if (task_stack_end_corrupted(prev))
+		panic("corrupted stack end detected inside scheduler\n");
 #endif
 
 	if (unlikely(in_atomic_preempt_off())) {
-- 
cgit v1.2.3


From b7fa30c9cc48c4f55663420472505d3b4f6e1705 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 9 Jun 2016 15:07:50 +0200
Subject: sched/fair: Fix post_init_entity_util_avg() serialization

Chris Wilson reported a divide by 0 at:

 post_init_entity_util_avg():

 >    725	if (cfs_rq->avg.util_avg != 0) {
 >    726		sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 > -> 727		sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 >    728
 >    729		if (sa->util_avg > cap)
 >    730			sa->util_avg = cap;
 >    731	} else {

Which given the lack of serialization, and the code generated from
update_cfs_rq_load_avg() is entirely possible:

	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
		sa->load_avg = max_t(long, sa->load_avg - r, 0);
		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
		removed_load = 1;
	}

turns into:

  ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
  ffffffff8108706b:       48 85 c0                test   %rax,%rax
  ffffffff8108706e:       74 40                   je     ffffffff810870b0
  ffffffff81087070:       4c 89 f8                mov    %r15,%rax
  ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
  ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
  ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
  ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
  ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
  ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx

Which you'll note ends up with 'sa->load_avg - r' in memory at
ffffffff8108707a.

By calling post_init_entity_util_avg() under rq->lock we're sure to be
fully serialized against PELT updates and cannot observe intermediate
state like this.

Reported-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value")
Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 +--
 kernel/sched/fair.c | 8 +++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d5394f5dc..13d0896aff87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Post initialize new task's util average when its cfs_rq is set */
+	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..4e33ad12bb68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+
+		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;
-- 
cgit v1.2.3


From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 13 Jun 2016 02:32:09 -0500
Subject: sched/debug: Fix deadlock when enabling sched events

I see a hang when enabling sched events:

  echo 1 > /sys/kernel/debug/tracing/events/sched/enable

The printk buffer shows:

  BUG: spinlock recursion on CPU#1, swapper/1/0
   lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1
  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014
  ...
  Call Trace:
   <IRQ>  [<ffffffff8143d663>] dump_stack+0x85/0xc2
   [<ffffffff81115948>] spin_dump+0x78/0xc0
   [<ffffffff81115aea>] do_raw_spin_lock+0x11a/0x150
   [<ffffffff81891471>] _raw_spin_lock+0x61/0x80
   [<ffffffff810e5466>] ? try_to_wake_up+0x256/0x4e0
   [<ffffffff810e5466>] try_to_wake_up+0x256/0x4e0
   [<ffffffff81891a0a>] ? _raw_spin_unlock_irqrestore+0x4a/0x80
   [<ffffffff810e5705>] wake_up_process+0x15/0x20
   [<ffffffff810cebb4>] insert_work+0x84/0xc0
   [<ffffffff810ced7f>] __queue_work+0x18f/0x660
   [<ffffffff810cf9a6>] queue_work_on+0x46/0x90
   [<ffffffffa00cd95b>] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper]
   [<ffffffffa00cdac0>] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper]
   [<ffffffff814babcd>] soft_cursor+0x1ad/0x230
   [<ffffffff814ba379>] bit_cursor+0x649/0x680
   [<ffffffff814b9d30>] ? update_attr.isra.2+0x90/0x90
   [<ffffffff814b5e6a>] fbcon_cursor+0x14a/0x1c0
   [<ffffffff81555ef8>] hide_cursor+0x28/0x90
   [<ffffffff81558b6f>] vt_console_print+0x3bf/0x3f0
   [<ffffffff81122c63>] call_console_drivers.constprop.24+0x183/0x200
   [<ffffffff811241f4>] console_unlock+0x3d4/0x610
   [<ffffffff811247f5>] vprintk_emit+0x3c5/0x610
   [<ffffffff81124bc9>] vprintk_default+0x29/0x40
   [<ffffffff811e965b>] printk+0x57/0x73
   [<ffffffff810f7a9e>] enqueue_entity+0xc2e/0xc70
   [<ffffffff810f7b39>] enqueue_task_fair+0x59/0xab0
   [<ffffffff8106dcd9>] ? kvm_sched_clock_read+0x9/0x20
   [<ffffffff8103fb39>] ? sched_clock+0x9/0x10
   [<ffffffff810e3fcc>] activate_task+0x5c/0xa0
   [<ffffffff810e4514>] ttwu_do_activate+0x54/0xb0
   [<ffffffff810e5cea>] sched_ttwu_pending+0x7a/0xb0
   [<ffffffff810e5e51>] scheduler_ipi+0x61/0x170
   [<ffffffff81059e7f>] smp_trace_reschedule_interrupt+0x4f/0x2a0
   [<ffffffff81893ba6>] trace_reschedule_interrupt+0x96/0xa0
   <EOI>  [<ffffffff8106e0d6>] ? native_safe_halt+0x6/0x10
   [<ffffffff8110fb1d>] ? trace_hardirqs_on+0xd/0x10
   [<ffffffff81040ac0>] default_idle+0x20/0x1a0
   [<ffffffff8104147f>] arch_cpu_idle+0xf/0x20
   [<ffffffff81102f8f>] default_idle_call+0x2f/0x50
   [<ffffffff8110332e>] cpu_startup_entry+0x37e/0x450
   [<ffffffff8105af70>] start_secondary+0x160/0x1a0

Note the hang only occurs when echoing the above from a physical serial
console, not from an ssh session.

The bug is caused by a deadlock where the task is trying to grab the rq
lock twice because printk()'s aren't safe in sched code.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e33ad12bb68..a2348deab7a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3246,7 +3246,7 @@ static inline void check_schedstat_required(void)
 			trace_sched_stat_iowait_enabled()  ||
 			trace_sched_stat_blocked_enabled() ||
 			trace_sched_stat_runtime_enabled())  {
-		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
 			     "stat_blocked and stat_runtime require the "
 			     "kernel parameter schedstats=enabled or "
 			     "kernel.sched_schedstats=1\n");
-- 
cgit v1.2.3


From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 9 Jun 2016 15:20:05 +0300
Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while
 processing sysrq-w

Lengthy output of sysrq-w may take a lot of time on slow serial console.

Currently we reset NMI-watchdog on the current CPU to avoid spurious
lockup messages. Sometimes this doesn't work since softlockup watchdog
might trigger on another CPU which is waiting for an IPI to proceed.
We reset softlockup watchdogs on all CPUs, but we do this only after
listing all tasks, and this may be too late on a busy system.

So, reset watchdogs CPUs earlier, in for_each_process_thread() loop.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13d0896aff87..4135ac83cb65 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5147,14 +5147,16 @@ void show_state_filter(unsigned long state_filter)
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 
-	touch_all_softlockup_watchdogs();
-
 #ifdef CONFIG_SCHED_DEBUG
 	if (!state_filter)
 		sysrq_sched_debug_show();
-- 
cgit v1.2.3


From df4565f9ebdc4d6dc50edc6e8fed08004e328332 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Tue, 24 May 2016 14:05:05 +0200
Subject: kernel/kcov: unproxify debugfs file's fops

Since commit 49d200deaa68 ("debugfs: prevent access to removed files'
private data"), a debugfs file's file_operations methods get proxied
through lifetime aware wrappers.

However, only a certain subset of the file_operations members is supported
by debugfs and ->mmap isn't among them -- it appears to be NULL from the
VFS layer's perspective.

This behaviour breaks the /sys/kernel/debug/kcov file introduced
concurrently with commit 5c9a8750a640 ("kernel: add kcov code coverage").

Since that file never gets removed, there is no file removal race and thus,
a lifetime checking proxy isn't needed.

Avoid the proxying for /sys/kernel/debug/kcov by creating it via
debugfs_create_file_unsafe() rather than debugfs_create_file().

Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data")
Fixes: 5c9a8750a640 ("kernel: add kcov code coverage")
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/kcov.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index a02f2dddd1d7..8d44b3fea9d0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {
 
 static int __init kcov_init(void)
 {
-	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+	/*
+	 * The kcov debugfs file won't ever get removed and thus,
+	 * there is no need to protect it against removal races. The
+	 * use of debugfs_create_file_unsafe() is actually safe here.
+	 */
+	if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
 		pr_err("failed to create kcov in debugfs\n");
 		return -ENOMEM;
 	}
-- 
cgit v1.2.3


From 8974189222159154c55f24ddad33e3613960521a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jun 2016 10:50:40 +0200
Subject: sched/fair: Fix cfs_rq avg tracking underflow

As per commit:

  b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization")

> the code generated from update_cfs_rq_load_avg():
>
> 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> 		removed_load = 1;
> 	}
>
> turns into:
>
> ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
> ffffffff8108706b:       48 85 c0                test   %rax,%rax
> ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
> ffffffff81087070:       4c 89 f8                mov    %r15,%rax
> ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
> ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
> ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
> ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
> ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
> ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
>
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.

So I _should_ have looked at other unserialized users of ->load_avg,
but alas. Luckily nikbor reported a similar /0 from task_h_load() which
instantly triggered recollection of this here problem.

Aside from the intermediate value hitting memory and causing problems,
there's another problem: the underflow detection relies on the signed
bit. This reduces the effective width of the variables, IOW its
effectively the same as having these variables be of signed type.

This patch changes to a different means of unsigned underflow
detection to not rely on the signed bit. This allows the variables to
use the 'full' unsigned range. And it does so with explicit LOAD -
STORE to ensure any intermediate value will never be visible in
memory, allowing these unserialized loads.

Note: GCC generates crap code for this, might warrant a look later.

Note2: I say 'full' above, if we end up at U*_MAX we'll still explode;
       maybe we should do clamping on add too.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: kernel@kyup.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a2348deab7a3..2ae68f0e3bf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
-- 
cgit v1.2.3


From 70c8217acd4383e069fe1898bbad36ea4fcdbdcc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 17 Jun 2016 16:10:42 -0400
Subject: tracing: Handle NULL formats in hold_module_trace_bprintk_format()

If a task uses a non constant string for the format parameter in
trace_printk(), then the trace_printk_fmt variable is set to NULL. This
variable is then saved in the __trace_printk_fmt section.

The function hold_module_trace_bprintk_format() checks to see if duplicate
formats are used by modules, and reuses them if so (saves them to the list
if it is new). But this function calls lookup_format() that does a strcmp()
to the value (which is now NULL) and can cause a kernel oops.

This wasn't an issue till 3debb0a9ddb ("tracing: Fix trace_printk() to print
when not using bprintk()") which added "__used" to the trace_printk_fmt
variable, and before that, the kernel simply optimized it out (no NULL value
was saved).

The fix is simply to handle the NULL pointer in lookup_format() and have the
caller ignore the value if it was NULL.

Link: http://lkml.kernel.org/r/1464769870-18344-1-git-send-email-zhengjun.xing@intel.com

Reported-by: xingzhen <zhengjun.xing@intel.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Fixes: 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()")
Cc: stable@vger.kernel.org # v3.5+
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_printk.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f96f0383f6c6..ad1d6164e946 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
 static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
 	struct trace_bprintk_fmt *pos;
+
+	if (!fmt)
+		return ERR_PTR(-EINVAL);
+
 	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
 		if (!strcmp(pos->fmt, fmt))
 			return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 	for (iter = start; iter < end; iter++) {
 		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
 		if (tb_fmt) {
-			*iter = tb_fmt->fmt;
+			if (!IS_ERR(tb_fmt))
+				*iter = tb_fmt->fmt;
 			continue;
 		}
 
-- 
cgit v1.2.3


From 6720a305df74ca30bcc10fc316881641b6ff0c80 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 23 Jun 2016 12:11:17 -0700
Subject: locking: avoid passing around 'thread_info' in mutex debugging code

None of the code actually wants a thread_info, it all wants a
task_struct, and it's just converting back and forth between the two
("ti->task" to get the task_struct from the thread_info, and
"task_thread_info(task)" to go the other way).

No semantic change.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/locking/mutex-debug.c | 12 ++++++------
 kernel/locking/mutex-debug.h |  4 ++--
 kernel/locking/mutex.c       |  6 +++---
 kernel/locking/mutex.h       |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 }
 
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-			    struct thread_info *ti)
+			    struct task_struct *task)
 {
 	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
 
 	/* Mark the current thread as blocked on the lock: */
-	ti->task->blocked_on = waiter;
+	task->blocked_on = waiter;
 }
 
 void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-			 struct thread_info *ti)
+			 struct task_struct *task)
 {
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
-	DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
-	DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
-	ti->task->blocked_on = NULL;
+	DEBUG_LOCKS_WARN_ON(waiter->task != task);
+	DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+	task->blocked_on = NULL;
 
 	list_del_init(&waiter->list);
 	waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..d06ae3bb46c5 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,
 extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
 extern void debug_mutex_add_waiter(struct mutex *lock,
 				   struct mutex_waiter *waiter,
-				   struct thread_info *ti);
+				   struct task_struct *task);
 extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-				struct thread_info *ti);
+				struct task_struct *task);
 extern void debug_mutex_unlock(struct mutex *lock);
 extern void debug_mutex_init(struct mutex *lock, const char *name,
 			     struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 79d2d765a75f..a70b90db3909 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
-	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+	debug_mutex_add_waiter(lock, &waiter, task);
 
 	/* add waiting tasks to the end of the waitqueue (FIFO): */
 	list_add_tail(&waiter.list, &lock->wait_list);
@@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	}
 	__set_task_state(task, TASK_RUNNING);
 
-	mutex_remove_waiter(lock, &waiter, current_thread_info());
+	mutex_remove_waiter(lock, &waiter, task);
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
 		atomic_set(&lock->count, 0);
@@ -605,7 +605,7 @@ skip_wait:
 	return 0;
 
 err:
-	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+	mutex_remove_waiter(lock, &waiter, task);
 	spin_unlock_mutex(&lock->wait_lock, flags);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..a68bae5e852a 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,7 +13,7 @@
 		do { spin_lock(lock); (void)(flags); } while (0)
 #define spin_unlock_mutex(lock, flags) \
 		do { spin_unlock(lock); (void)(flags); } while (0)
-#define mutex_remove_waiter(lock, waiter, ti) \
+#define mutex_remove_waiter(lock, waiter, task) \
 		__list_del((waiter)->list.prev, (waiter)->list.next)
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-- 
cgit v1.2.3


From 4c5ea0a9cd02d6aa8adc86e100b2a4cff8d614ff Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 21 Jun 2016 18:52:17 +0200
Subject: locking/static_key: Fix concurrent static_key_slow_inc()

The following scenario is possible:

    CPU 1                                   CPU 2
    static_key_slow_inc()
     atomic_inc_not_zero()
      -> key.enabled == 0, no increment
     jump_label_lock()
     atomic_inc_return()
      -> key.enabled == 1 now
                                            static_key_slow_inc()
                                             atomic_inc_not_zero()
                                              -> key.enabled == 1, inc to 2
                                             return
                                            ** static key is wrong!
     jump_label_update()
     jump_label_unlock()

Testing the static key at the point marked by (**) will follow the
wrong path for jumps that have not been patched yet.  This can
actually happen when creating many KVM virtual machines with userspace
LAPIC emulation; just run several copies of the following program:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        for (;;) {
            int kvmfd = open("/dev/kvm", O_RDONLY);
            int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
            close(ioctl(vmfd, KVM_CREATE_VCPU, 1));
            close(vmfd);
            close(kvmfd);
        }
        return 0;
    }

Every KVM_CREATE_VCPU ioctl will attempt a static_key_slow_inc() call.
The static key's purpose is to skip NULL pointer checks and indeed one
of the processes eventually dereferences NULL.

As explained in the commit that introduced the bug:

  706249c222f6 ("locking/static_keys: Rework update logic")

jump_label_update() needs key.enabled to be true.  The solution adopted
here is to temporarily make key.enabled == -1, and use go down the
slow path when key.enabled <= 0.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 706249c222f6 ("locking/static_keys: Rework update logic")
Link: http://lkml.kernel.org/r/1466527937-69798-1-git-send-email-pbonzini@redhat.com
[ Small stylistic edits to the changelog and the code. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/jump_label.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..4b353e0be121 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
 
 void static_key_slow_inc(struct static_key *key)
 {
+	int v, v1;
+
 	STATIC_KEY_CHECK_USE();
-	if (atomic_inc_not_zero(&key->enabled))
-		return;
+
+	/*
+	 * Careful if we get concurrent static_key_slow_inc() calls;
+	 * later calls must wait for the first one to _finish_ the
+	 * jump_label_update() process.  At the same time, however,
+	 * the jump_label_update() call below wants to see
+	 * static_key_enabled(&key) for jumps to be updated properly.
+	 *
+	 * So give a special meaning to negative key->enabled: it sends
+	 * static_key_slow_inc() down the slow path, and it is non-zero
+	 * so it counts as "enabled" in jump_label_update().  Note that
+	 * atomic_inc_unless_negative() checks >= 0, so roll our own.
+	 */
+	for (v = atomic_read(&key->enabled); v > 0; v = v1) {
+		v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
+		if (likely(v1 == v))
+			return;
+	}
 
 	jump_label_lock();
-	if (atomic_inc_return(&key->enabled) == 1)
+	if (atomic_read(&key->enabled) == 0) {
+		atomic_set(&key->enabled, -1);
 		jump_label_update(key);
+		atomic_set(&key->enabled, 1);
+	} else {
+		atomic_inc(&key->enabled);
+	}
 	jump_label_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
+	/*
+	 * The negative count check is valid even when a negative
+	 * key->enabled is in use by static_key_slow_inc(); a
+	 * __static_key_slow_dec() before the first static_key_slow_inc()
+	 * returns is unbalanced, because all other static_key_slow_inc()
+	 * instances block while the update is in progress.
+	 */
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
-- 
cgit v1.2.3


From 094f469172e00d6ab0a3130b0e01c83b3cf3a98d Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Thu, 16 Jun 2016 15:57:01 +0300
Subject: sched/fair: Initialize throttle_count for new task-groups lazily

Cgroup created inside throttled group must inherit current throttle_count.
Broken throttle_count allows to nominate throttled entries as a next buddy,
later this leads to null pointer dereference in pick_next_task_fair().

This patch initialize cfs_rq->throttle_count at first enqueue: laziness
allows to skip locking all rq at group creation. Lazy approach also allows
to skip full sub-tree scan at throttling hierarchy (not in this patch).

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Link: http://lkml.kernel.org/r/146608182119.21870.8439834428248129633.stgit@buzz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 20 ++++++++++++++++++++
 kernel/sched/sched.h |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ae68f0e3bf5..8c5d8c0c8827 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4202,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..7cbeb92a1cb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -437,7 +437,7 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
-- 
cgit v1.2.3


From 754bd598be9bbc953bc709a9e8ed7f3188bfb9d7 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Thu, 16 Jun 2016 15:57:15 +0300
Subject: sched/fair: Do not announce throttled next buddy in
 dequeue_task_fair()

Hierarchy could be already throttled at this point. Throttled next
buddy could trigger a NULL pointer dereference in pick_next_task_fair().

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/146608183552.21905.15924473394414832071.stgit@buzz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c5d8c0c8827..bdcbeea90c95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4537,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
-			if (task_sleep && parent_entity(se))
-				set_next_buddy(parent_entity(se));
-
-			/* avoid re-evaluating load for this entity */
-			se = parent_entity(se);
+			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+				set_next_buddy(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
-- 
cgit v1.2.3


From feb245e304f343cf5e4f9123db36354144dce8a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 16 Jun 2016 15:35:04 -0400
Subject: sched/core: Allow kthreads to fall back to online && !active cpus

During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is
online but not active.  A CPU_ONLINE callback may create or bind a
kthread so that its cpus_allowed mask only allows the CPU which is
being brought online.  The kthread may start executing before the CPU
is made active and can end up in select_fallback_rq().

In such cases, the expected behavior is selecting the CPU which is
coming online; however, because select_fallback_rq() only chooses from
active CPUs, it determines that the task doesn't have any viable CPU
in its allowed mask and ends up overriding it to cpu_possible_mask.

CPU_ONLINE callbacks should be able to put kthreads on the CPU which
is coming online.  Update select_fallback_rq() so that it follows
cpu_online() rather than cpu_active() for kthreads.

Reported-by: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Tested-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4135ac83cb65..51d7105f529a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-			if (!cpu_active(dest_cpu))
+			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+				continue;
+			if (!cpu_online(dest_cpu))
 				continue;
 			goto out;
 		}
-- 
cgit v1.2.3


From b235beea9e996a4d36fed6cfef4801a3e7d7a9a5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 24 Jun 2016 15:09:37 -0700
Subject: Clarify naming of thread info/stack allocators

We've had the thread info allocated together with the thread stack for
most architectures for a long time (since the thread_info was split off
from the task struct), but that is about to change.

But the patches that move the thread info to be off-stack (and a part of
the task struct instead) made it clear how confused the allocator and
freeing functions are.

Because the common case was that we share an allocation with the thread
stack and the thread_info, the two pointers were identical.  That
identity then meant that we would have things like

	ti = alloc_thread_info_node(tsk, node);
	...
	tsk->stack = ti;

which certainly _worked_ (since stack and thread_info have the same
value), but is rather confusing: why are we assigning a thread_info to
the stack? And if we move the thread_info away, the "confusing" code
just gets to be entirely bogus.

So remove all this confusion, and make it clear that we are doing the
stack allocation by renaming and clarifying the function names to be
about the stack.  The fact that the thread_info then shares the
allocation is an implementation detail, and not really about the
allocation itself.

This is a pure renaming and type fix: we pass in the same pointer, it's
just that we clarify what the pointer means.

The ia64 code that actually only has one single allocation (for all of
task_struct, thread_info and kernel thread stack) now looks a bit odd,
but since "tsk->stack" is actually not even used there, that oddity
doesn't matter.  It would be a separate thing to clean that up, I
intentionally left the ia64 changes as a pure brute-force renaming and
type change.

Acked-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 5c2c355aa97f..37b9439b8c07 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk)
 }
 #endif
 
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
 {
 }
 
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
 
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
 # if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
@@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 	return page ? page_address(page) : NULL;
 }
 
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(unsigned long *stack)
 {
-	struct page *page = virt_to_page(ti);
+	struct page *page = virt_to_page(stack);
 
 	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 				    -(1 << THREAD_SIZE_ORDER));
 	__free_kmem_pages(page, THREAD_SIZE_ORDER);
 }
 # else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
 
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
-	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_info(struct thread_info *ti)
+static void free_stack(unsigned long *stack)
 {
-	kmem_cache_free(thread_info_cache, ti);
+	kmem_cache_free(thread_stack_cache, stack);
 }
 
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
 {
-	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
 					      THREAD_SIZE, 0, NULL);
-	BUG_ON(thread_info_cache == NULL);
+	BUG_ON(thread_stack_cache == NULL);
 }
 # endif
 #endif
@@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(unsigned long *stack, int account)
 {
-	struct zone *zone = page_zone(virt_to_page(ti));
+	struct zone *zone = page_zone(virt_to_page(stack));
 
 	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
 }
@@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account)
 void free_task(struct task_struct *tsk)
 {
 	account_kernel_stack(tsk->stack, -1);
-	arch_release_thread_info(tsk->stack);
-	free_thread_info(tsk->stack);
+	arch_release_thread_stack(tsk->stack);
+	free_thread_stack(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
@@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
-	struct thread_info *ti;
+	unsigned long *stack;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info_node(tsk, node);
-	if (!ti)
+	stack = alloc_thread_stack_node(tsk, node);
+	if (!stack)
 		goto free_tsk;
 
 	err = arch_dup_task_struct(tsk, orig);
 	if (err)
-		goto free_ti;
+		goto free_stack;
 
-	tsk->stack = ti;
+	tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->task_frag.page = NULL;
 	tsk->wake_q.next = NULL;
 
-	account_kernel_stack(ti, 1);
+	account_kernel_stack(stack, 1);
 
 	kcov_task_init(tsk);
 
 	return tsk;
 
-free_ti:
-	free_thread_info(ti);
+free_stack:
+	free_thread_stack(stack);
 free_tsk:
 	free_task_struct(tsk);
 	return NULL;
-- 
cgit v1.2.3


From 74070542099c66d87aebeacd7b54dc0e8b6a73f9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 24 Jun 2016 14:50:16 -0700
Subject: oom, suspend: fix oom_reaper vs. oom_killer_disable race

Tetsuo has reported the following potential oom_killer_disable vs.
oom_reaper race:

 (1) freeze_processes() starts freezing user space threads.
 (2) Somebody (maybe a kenrel thread) calls out_of_memory().
 (3) The OOM killer calls mark_oom_victim() on a user space thread
     P1 which is already in __refrigerator().
 (4) oom_killer_disable() sets oom_killer_disabled = true.
 (5) P1 leaves __refrigerator() and enters do_exit().
 (6) The OOM reaper calls exit_oom_victim(P1) before P1 can call
     exit_oom_victim(P1).
 (7) oom_killer_disable() returns while P1 not yet finished
 (8) P1 perform IO/interfere with the freezer.

This situation is unfortunate.  We cannot move oom_killer_disable after
all the freezable kernel threads are frozen because the oom victim might
depend on some of those kthreads to make a forward progress to exit so
we could deadlock.  It is also far from trivial to teach the oom_reaper
to not call exit_oom_victim() because then we would lose a guarantee of
the OOM killer and oom_killer_disable forward progress because
exit_mm->mmput might block and never call exit_oom_victim.

It seems the easiest way forward is to workaround this race by calling
try_to_freeze_tasks again after oom_killer_disable.  This will make sure
that all the tasks are frozen or it bails out.

Fixes: 449d777d7ad6 ("mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper")
Link: http://lkml.kernel.org/r/1466597634-16199-1-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/process.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..0c2ee9761d57 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -146,6 +146,18 @@ int freeze_processes(void)
 	if (!error && !oom_killer_disable())
 		error = -EBUSY;
 
+	/*
+	 * There is a hard to fix race between oom_reaper kernel thread
+	 * and oom_killer_disable. oom_reaper calls exit_oom_victim
+	 * before the victim reaches exit_mm so try to freeze all the tasks
+	 * again and catch such a left over task.
+	 */
+	if (!error) {
+		pr_info("Double checking all user space processes after OOM killer disable... ");
+		error = try_to_freeze_tasks(true);
+		pr_cont("\n");
+	}
+
 	if (error)
 		thaw_processes();
 	return error;
-- 
cgit v1.2.3


From 9521d39976db20f8ef9b56af66661482a17d5364 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Sat, 25 Jun 2016 21:53:30 +1000
Subject: Fix build break in fork.c when THREAD_SIZE < PAGE_SIZE

Commit b235beea9e99 ("Clarify naming of thread info/stack allocators")
breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE:

  kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack'
  kernel/fork.c:355:8: error: assignment from incompatible pointer type
    stack = alloc_thread_stack_node(tsk, node);
    ^

Fix it by renaming free_stack() to free_thread_stack(), and updating the
return type of alloc_thread_stack_node().

Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 37b9439b8c07..4a7ec0c6c88c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -183,13 +183,13 @@ static inline void free_thread_stack(unsigned long *stack)
 # else
 static struct kmem_cache *thread_stack_cache;
 
-static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_stack(unsigned long *stack)
+static void free_thread_stack(unsigned long *stack)
 {
 	kmem_cache_free(thread_stack_cache, stack);
 }
-- 
cgit v1.2.3