From c269cba35b061181bc23c470809c00e8f71e535a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 15:02:07 +0100 Subject: memremap: add arch specific hook for MEMREMAP_WB mappings Currently, the memremap code serves MEMREMAP_WB mappings directly from the kernel direct mapping, unless the region is in high memory, in which case it falls back to using ioremap_cache(). However, the semantics of ioremap_cache() are not unambiguously defined, and on ARM, it will actually result in a mapping type that differs from the attributes used for the linear mapping, and for this reason, the ioremap_cache() call fails if the region is part of the memory managed by the kernel. So instead, implement an optional hook 'arch_memremap_wb' whose default implementation calls ioremap_cache() as before, but which can be overridden by the architecture to do what is appropriate for it. Acked-by: Dan Williams Signed-off-by: Ard Biesheuvel --- kernel/memremap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a6d382312e6f..017532193fb1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) } #endif +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + static void *try_ram_remap(resource_size_t offset, size_t size) { unsigned long pfn = PHYS_PFN(offset); @@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size) /* In the simple case just return the existing linear address */ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); - return NULL; /* fallback to ioremap_cache */ + return NULL; /* fallback to arch_memremap_wb */ } /** @@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); if (!addr) - addr = ioremap_cache(offset, size); + addr = arch_memremap_wb(offset, size); } /* -- cgit v1.2.3 From 0bf676d1fd5144e66ac06939fa3c7c12086c3b18 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Mar 2016 10:49:28 +0200 Subject: audit: cleanup prune_tree_thread We can use kthread_run instead of kthread_create+wake_up_process for creating the thread. We do not need to set the task state to TASK_RUNNING after schedule(), the process is in that state already. And we do not need to set the state to TASK_INTERRUPTIBLE when not doing schedule() as we set the state to TASK_RUNNING immediately afterwards. Signed-off-by: Jiri Slaby Cc: Paul Moore Cc: Eric Paris Cc: Signed-off-by: Paul Moore --- kernel/audit_tree.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5efe9b299a12..25772476fa4a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg) static int prune_tree_thread(void *unused) { for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty(&prune_list)) + if (list_empty(&prune_list)) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); + } mutex_lock(&audit_cmd_mutex); mutex_lock(&audit_filter_mutex); @@ -693,16 +693,14 @@ static int audit_launch_prune(void) { if (prune_thread) return 0; - prune_thread = kthread_create(prune_tree_thread, NULL, + prune_thread = kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); if (IS_ERR(prune_thread)) { pr_err("cannot start thread audit_prune_tree"); prune_thread = NULL; return -ENOMEM; - } else { - wake_up_process(prune_thread); - return 0; } + return 0; } /* called with audit_filter_mutex */ -- cgit v1.2.3 From 7ffb8e317bae03b8ee5bdcec93dc3723be945e9b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 4 Apr 2016 16:44:02 -0400 Subject: audit: we don't need to __set_current_state(TASK_RUNNING) Remove the calls to __set_current_state() to mark the task as running and do some related cleanup in wait_for_auditd() to limit the amount of work we do when we aren't going to reschedule the current task. Signed-off-by: Paul Moore --- kernel/audit.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3a3e5deeda8d..f52fbefede09 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -430,7 +430,6 @@ restart: attempts, audit_pid); set_current_state(TASK_INTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); goto restart; } } @@ -1324,15 +1323,14 @@ static inline void audit_get_stamp(struct audit_context *ctx, static long wait_for_auditd(long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&audit_backlog_wait, &wait); if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + add_wait_queue_exclusive(&audit_backlog_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); sleep_time = schedule_timeout(sleep_time); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); + remove_wait_queue(&audit_backlog_wait, &wait); + } return sleep_time; } -- cgit v1.2.3 From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: KEYS: Generalise system_verify_data() to provide access to internal content Generalise system_verify_data() to provide access to internal content through a callback. This allows all the PKCS#7 stuff to be hidden inside this function and removed from the PE file parser and the PKCS#7 test key. If external content is not required, NULL should be passed as data to the function. If the callback is not required, that can be set to NULL. The function is now called verify_pkcs7_signature() to contrast with verify_pefile_signature() and the definitions of both have been moved into linux/verification.h along with the key_being_used_for enum. Signed-off-by: David Howells --- kernel/module_signing.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 64b9dead4a07..593aace88a02 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -EBADMSG; } - return system_verify_data(mod, modlen, mod + modlen, sig_len, - VERIFYING_MODULE_SIGNATURE); + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } -- cgit v1.2.3 From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: PKCS#7: Make trust determination dependent on contents of trust keyring Make the determination of the trustworthiness of a key dependent on whether a key that can verify it is present in the supplied ring of trusted keys rather than whether or not the verifying key has KEY_FLAG_TRUSTED set. verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust chain cannot be verified. Signed-off-by: David Howells --- kernel/module_signing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 593aace88a02..6a64e03b9f44 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) } return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE, + NULL, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } -- cgit v1.2.3 From a511e1af8b12f44c6e55786c463c9f093c214fb6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:26 +0100 Subject: KEYS: Move the point of trust determination to __key_link() Move the point at which a key is determined to be trustworthy to __key_link() so that we use the contents of the keyring being linked in to to determine whether the key being linked in is trusted or not. What is 'trusted' then becomes a matter of what's in the keyring. Currently, the test is done when the key is parsed, but given that at that point we can only sensibly refer to the contents of the system trusted keyring, we can only use that as the basis for working out the trustworthiness of a new key. With this change, a trusted keyring is a set of keys that once the trusted-only flag is set cannot be added to except by verification through one of the contained keys. Further, adding a key into a trusted keyring, whilst it might grant trustworthiness in the context of that keyring, does not automatically grant trustworthiness in the context of a second keyring to which it could be secondarily linked. To accomplish this, the authentication data associated with the key source must now be retained. For an X.509 cert, this means the contents of the AuthorityKeyIdentifier and the signature data. If system keyrings are disabled then restrict_link_by_builtin_trusted() resolves to restrict_link_reject(). The integrity digital signature code still works correctly with this as it was previously using KEY_FLAG_TRUSTED_ONLY, which doesn't permit anything to be added if there is no system keyring against which trust can be determined. Signed-off-by: David Howells --- kernel/module_signing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6a64e03b9f44..937c844bee4a 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include "module-internal.h" -- cgit v1.2.3 From 9ebc57cfaad21aacbc363eecead579269a46b493 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2016 20:39:55 -0400 Subject: tracing: Rename check_ignore_pid() to ignore_this_task() The name "check_ignore_pid" is confusing in trying to figure out if the pid should be ignored or not. Rename it to "ignore_this_task" which is pretty straight forward, as a task (not a pid) is passed in, and should if true should be ignored. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05ddc0820771..598a18675a6b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -484,7 +484,7 @@ static int cmp_pid(const void *key, const void *elt) } static bool -check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { pid_t search_pid; pid_t *pid; @@ -517,8 +517,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, prev) && - check_ignore_pid(pid_list, next)); + ignore_this_task(pid_list, prev) && + ignore_this_task(pid_list, next)); } static void @@ -531,7 +531,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, next)); + ignore_this_task(pid_list, next)); } static void @@ -547,7 +547,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, task)); + ignore_this_task(pid_list, task)); } static void @@ -564,7 +564,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -1561,7 +1561,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + ignore_this_task(pid_list, current)); } static ssize_t -- cgit v1.2.3 From f4d34a87e9c10f0ffd03d3548db6bfb200d06cdf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2016 16:27:49 -0400 Subject: tracing: Use pid bitmap instead of a pid array for set_event_pid In order to add the ability to let tasks that are filtered by the events have their children also be traced on fork (and then not traced on exit), convert the array into a pid bitmask. Most of the time the number of pids is only 32768 pids or a 4k bitmask, which is the same size as the default list currently is, and that list could grow if more pids are listed. This also greatly simplifies the code. Suggested-by: "H. Peter Anvin" Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +- kernel/trace/trace_events.c | 221 ++++++++++++++++++++------------------------ 2 files changed, 102 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3fff4adfd431..68cbb8e10aea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -177,9 +177,8 @@ struct trace_options { }; struct trace_pid_list { - unsigned int nr_pids; - int order; - pid_t *pids; + int pid_max; + unsigned long *pids; }; /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 598a18675a6b..45f7cc72bf25 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -static int cmp_pid(const void *key, const void *elt) -{ - const pid_t *search_pid = key; - const pid_t *pid = elt; - - if (*search_pid == *pid) - return 0; - if (*search_pid < *pid) - return -1; - return 1; -} +/* Shouldn't this be in a header? */ +extern int pid_max; static bool ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { - pid_t search_pid; - pid_t *pid; + pid_t pid; /* * Return false, because if filtered_pids does not exist, @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - search_pid = task->pid; + pid = task->pid; - pid = bsearch(&search_pid, filtered_pids->pids, - filtered_pids->nr_pids, sizeof(pid_t), - cmp_pid); - if (!pid) + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (task->pid >= filtered_pids->pid_max) return true; - return false; + return !test_bit(task->pid, filtered_pids->pids); } static void @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); } @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; + unsigned long pid; + loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) pid_list = rcu_dereference_sched(tr->filtered_pids); - if (!pid_list || *pos >= pid_list->nr_pids) + if (!pid_list) + return NULL; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) return NULL; - return (void *)&pid_list->pids[*pos]; + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)p_next(m, (void *)pid, &l)) + ; + return (void *)pid; } static void p_stop(struct seq_file *m, void *p) @@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static void * -p_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_array *tr = m->private; - struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - - (*pos)++; - - if (*pos >= pid_list->nr_pids) - return NULL; - - return (void *)&pid_list->pids[*pos]; -} - static int p_show(struct seq_file *m, void *v) { - pid_t *pid = v; + unsigned long pid = (unsigned long)v - 1; - seq_printf(m, "%d\n", *pid); + seq_printf(m, "%lu\n", pid); return 0; } @@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } -static int max_pids(struct trace_pid_list *pid_list) -{ - return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); -} - static void ignore_task_cpu(void *data) { struct trace_array *tr = data; @@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; - struct trace_pid_list *pid_list = NULL; + struct trace_pid_list *pid_list; struct trace_event_file *file; struct trace_parser parser; unsigned long val; @@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, ssize_t read = 0; ssize_t ret = 0; pid_t pid; - int i; + int nr_pids = 0; if (!cnt) return 0; @@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, return -ENOMEM; mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + /* - * Load as many pids into the array before doing a - * swap from the tr->filtered_pids to the new list. + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) { + read = -ENOMEM; + goto out; + } + pid_list->pid_max = READ_ONCE(pid_max); + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + read = -ENOMEM; + goto out; + } + if (filtered_pids) { + /* copy the current bits to the new max */ + pid = find_first_bit(filtered_pids->pids, + filtered_pids->pid_max); + while (pid < filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + pid = find_next_bit(filtered_pids->pids, + filtered_pids->pid_max, + pid + 1); + nr_pids++; + } + } + while (cnt > 0) { this_pos = 0; @@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; - if (val > INT_MAX) + if (val >= pid_list->pid_max) break; pid = (pid_t)val; - ret = -ENOMEM; - if (!pid_list) { - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) - break; - - filtered_pids = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); - if (filtered_pids) - pid_list->order = filtered_pids->order; - else - pid_list->order = 0; - - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order); - if (!pid_list->pids) - break; - - if (filtered_pids) { - pid_list->nr_pids = filtered_pids->nr_pids; - memcpy(pid_list->pids, filtered_pids->pids, - pid_list->nr_pids * sizeof(pid_t)); - } else - pid_list->nr_pids = 0; - } - - if (pid_list->nr_pids >= max_pids(pid_list)) { - pid_t *pid_page; - - pid_page = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order + 1); - if (!pid_page) - break; - memcpy(pid_page, pid_list->pids, - pid_list->nr_pids * sizeof(pid_t)); - free_pages((unsigned long)pid_list->pids, pid_list->order); - - pid_list->order++; - pid_list->pids = pid_page; - } + set_bit(pid, pid_list->pids); + nr_pids++; - pid_list->pids[pid_list->nr_pids++] = pid; trace_parser_clear(&parser); ret = 0; } trace_parser_put(&parser); if (ret < 0) { - if (pid_list) - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); - mutex_unlock(&event_mutex); - return ret; - } - - if (!pid_list) { - mutex_unlock(&event_mutex); - return ret; + read = ret; + goto out; } - sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); - - /* Remove duplicates */ - for (i = 1; i < pid_list->nr_pids; i++) { - int start = i; - - while (i < pid_list->nr_pids && - pid_list->pids[i - 1] == pid_list->pids[i]) - i++; - - if (start != i) { - if (i < pid_list->nr_pids) { - memmove(&pid_list->pids[start], &pid_list->pids[i], - (pid_list->nr_pids - i) * sizeof(pid_t)); - pid_list->nr_pids -= i - start; - i = start; - } else - pid_list->nr_pids = start; - } + if (!nr_pids) { + /* Cleared the list of pids */ + vfree(pid_list->pids); + kfree(pid_list); + read = ret; + if (!filtered_pids) + goto out; + pid_list = NULL; } - rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); + vfree(filtered_pids->pids); kfree(filtered_pids); } else { /* @@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); + out: mutex_unlock(&event_mutex); ret = read; - *ppos += read; + if (read > 0) + *ppos += read; return ret; } -- cgit v1.2.3 From c37775d57830a36382a9774bb84eca4ce3d019cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2016 16:59:18 -0400 Subject: tracing: Add infrastructure to allow set_event_pid to follow children Add the infrastructure needed to have the PIDs in set_event_pid to automatically add PIDs of the children of the tasks that have their PIDs in set_event_pid. This will also remove PIDs from set_event_pid when a task exits This is implemented by adding hooks into the fork and exit tracepoints. On fork, the PIDs are added to the list, and on exit, they are removed. Add a new option called event_fork that when set, PIDs in set_event_pid will automatically get their children PIDs added when they fork, as well as any task that exits will have its PID removed from set_event_pid. This works for instances as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 84 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 79 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a2f0b9f33e9b..0d12dbde8399 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3571,6 +3571,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_EVENT_FORK) + trace_event_follow_fork(tr, enabled); + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 68cbb8e10aea..2525042760e6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -655,6 +655,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -966,6 +967,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ + C(EVENT_FORK, "event-fork"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 45f7cc72bf25..add81dff7520 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -474,11 +474,23 @@ static void ftrace_clear_events(struct trace_array *tr) /* Shouldn't this be in a header? */ extern int pid_max; +/* Returns true if found in filter */ static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { - pid_t pid; + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (search_pid >= filtered_pids->pid_max) + return false; + + return test_bit(search_pid, filtered_pids->pids); +} +static bool +ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ /* * Return false, because if filtered_pids does not exist, * all pids are good to trace. @@ -486,16 +498,68 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - pid = task->pid; + return !find_filtered_pid(filtered_pids, task->pid); +} - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (task->pid >= filtered_pids->pid_max) - return true; +static void filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!find_filtered_pid(pid_list, self->pid)) + return; + } + + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + +static void +event_filter_pid_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + filter_add_remove_task(pid_list, NULL, task); +} - return !test_bit(task->pid, filtered_pids->pids); +static void +event_filter_pid_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + filter_add_remove_task(pid_list, self, task); +} + +void trace_event_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, + tr, INT_MIN); + register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + tr, INT_MAX); + } else { + unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, + tr); + unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + tr); + } } static void -- cgit v1.2.3 From 08d43a5fa063e03c860f2f391a30c388bcbc948e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:50 -0600 Subject: tracing: Add lock-free tracing_map Add tracing_map, a special-purpose lock-free map for tracing. tracing_map is designed to aggregate or 'sum' one or more values associated with a specific object of type tracing_map_elt, which is associated by the map to a given key. It provides various hooks allowing per-tracer customization and is separated out into a separate file in order to allow it to be shared between multiple tracers, but isn't meant to be generally used outside of that context. The tracing_map implementation was inspired by lock-free map algorithms originated by Dr. Cliff Click: http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf Link: http://lkml.kernel.org/r/b43d68d1add33582a396f553c8ef705a33a6a748.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 13 + kernel/trace/Makefile | 1 + kernel/trace/tracing_map.c | 1058 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/tracing_map.h | 282 ++++++++++++ 4 files changed, 1354 insertions(+) create mode 100644 kernel/trace/tracing_map.c create mode 100644 kernel/trace/tracing_map.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e45db6b0d878..48b732943e0e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -528,6 +528,19 @@ config MMIOTRACE See Documentation/trace/mmiotrace.txt. If you are not helping to develop drivers, say N. +config TRACING_MAP + bool + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + default n + help + tracing_map is a special-purpose lock-free map for tracing, + separated out as a stand-alone facility in order to allow it + to be shared between multiple tracers. It isn't meant to be + generally used outside of that context, and is normally + selected by tracers that use it. + + If in doubt, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..4255c4057aaa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c new file mode 100644 index 000000000000..dd6401dd145e --- /dev/null +++ b/kernel/trace/tracing_map.c @@ -0,0 +1,1058 @@ +/* + * tracing_map - lock-free map for tracing + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2015 Tom Zanussi + * + * tracing_map implementation inspired by lock-free map algorithms + * originated by Dr. Cliff Click: + * + * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable + * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf + */ + +#include +#include +#include +#include + +#include "tracing_map.h" +#include "trace.h" + +/* + * NOTE: For a detailed description of the data structures used by + * these functions (such as tracing_map_elt) please see the overview + * of tracing_map data structures at the beginning of tracing_map.h. + */ + +/** + * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * @n: The value to add to the sum + * + * Add n to sum i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_sum_field() when the tracing map was set up. + */ +void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_add(n, &elt->fields[i].sum); +} + +/** + * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * + * Retrieve the value of the sum i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_sum_field() when the tracing map was set + * up. + * + * Return: The sum associated with field i for elt. + */ +u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->fields[i].sum); +} + +int tracing_map_cmp_string(void *val_a, void *val_b) +{ + char *a = val_a; + char *b = val_b; + + return strcmp(a, b); +} + +int tracing_map_cmp_none(void *val_a, void *val_b) +{ + return 0; +} + +static int tracing_map_cmp_atomic64(void *val_a, void *val_b) +{ + u64 a = atomic64_read((atomic64_t *)val_a); + u64 b = atomic64_read((atomic64_t *)val_b); + + return (a > b) ? 1 : ((a < b) ? -1 : 0); +} + +#define DEFINE_TRACING_MAP_CMP_FN(type) \ +static int tracing_map_cmp_##type(void *val_a, void *val_b) \ +{ \ + type a = *(type *)val_a; \ + type b = *(type *)val_b; \ + \ + return (a > b) ? 1 : ((a < b) ? -1 : 0); \ +} + +DEFINE_TRACING_MAP_CMP_FN(s64); +DEFINE_TRACING_MAP_CMP_FN(u64); +DEFINE_TRACING_MAP_CMP_FN(s32); +DEFINE_TRACING_MAP_CMP_FN(u32); +DEFINE_TRACING_MAP_CMP_FN(s16); +DEFINE_TRACING_MAP_CMP_FN(u16); +DEFINE_TRACING_MAP_CMP_FN(s8); +DEFINE_TRACING_MAP_CMP_FN(u8); + +tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed) +{ + tracing_map_cmp_fn_t fn = tracing_map_cmp_none; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = tracing_map_cmp_s64; + else + fn = tracing_map_cmp_u64; + break; + case 4: + if (field_is_signed) + fn = tracing_map_cmp_s32; + else + fn = tracing_map_cmp_u32; + break; + case 2: + if (field_is_signed) + fn = tracing_map_cmp_s16; + else + fn = tracing_map_cmp_u16; + break; + case 1: + if (field_is_signed) + fn = tracing_map_cmp_s8; + else + fn = tracing_map_cmp_u8; + break; + } + + return fn; +} + +static int tracing_map_add_field(struct tracing_map *map, + tracing_map_cmp_fn_t cmp_fn) +{ + int ret = -EINVAL; + + if (map->n_fields < TRACING_MAP_FIELDS_MAX) { + ret = map->n_fields; + map->fields[map->n_fields++].cmp_fn = cmp_fn; + } + + return ret; +} + +/** + * tracing_map_add_sum_field - Add a field describing a tracing_map sum + * @map: The tracing_map + * + * Add a sum field to the key and return the index identifying it in + * the map and associated tracing_map_elts. This is the index used + * for instance to update a sum for a particular tracing_map_elt using + * tracing_map_update_sum() or reading it via tracing_map_read_sum(). + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts. + */ +int tracing_map_add_sum_field(struct tracing_map *map) +{ + return tracing_map_add_field(map, tracing_map_cmp_atomic64); +} + +/** + * tracing_map_add_key_field - Add a field describing a tracing_map key + * @map: The tracing_map + * @offset: The offset within the key + * @cmp_fn: The comparison function that will be used to sort on the key + * + * Let the map know there is a key and that if it's used as a sort key + * to use cmp_fn. + * + * A key can be a subset of a compound key; for that purpose, the + * offset param is used to describe where within the the compound key + * the key referenced by this key field resides. + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts. + */ +int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn) + +{ + int idx = tracing_map_add_field(map, cmp_fn); + + if (idx < 0) + return idx; + + map->fields[idx].offset = offset; + + map->key_idx[map->n_keys++] = idx; + + return idx; +} + +void tracing_map_array_clear(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a->pages) + return; + + for (i = 0; i < a->n_pages; i++) + memset(a->pages[i], 0, PAGE_SIZE); +} + +void tracing_map_array_free(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a) + return; + + if (!a->pages) { + kfree(a); + return; + } + + for (i = 0; i < a->n_pages; i++) { + if (!a->pages[i]) + break; + free_page((unsigned long)a->pages[i]); + } +} + +struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, + unsigned int entry_size) +{ + struct tracing_map_array *a; + unsigned int i; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return NULL; + + a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); + a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift); + a->n_pages = n_elts / a->entries_per_page; + if (!a->n_pages) + a->n_pages = 1; + a->entry_shift = fls(a->entries_per_page) - 1; + a->entry_mask = (1 << a->entry_shift) - 1; + + a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL); + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!a->pages[i]) + goto free; + } + out: + return a; + free: + tracing_map_array_free(a); + a = NULL; + + goto out; +} + +static void tracing_map_elt_clear(struct tracing_map_elt *elt) +{ + unsigned i; + + for (i = 0; i < elt->map->n_fields; i++) + if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) + atomic64_set(&elt->fields[i].sum, 0); + + if (elt->map->ops && elt->map->ops->elt_clear) + elt->map->ops->elt_clear(elt); +} + +static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) +{ + unsigned int i; + + tracing_map_elt_clear(elt); + + for (i = 0; i < elt->map->n_fields; i++) { + elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn; + + if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64) + elt->fields[i].offset = elt->map->fields[i].offset; + } +} + +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + kfree(elt->fields); + kfree(elt->key); + kfree(elt); +} + +static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) +{ + struct tracing_map_elt *elt; + int err = 0; + + elt = kzalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return ERR_PTR(-ENOMEM); + + elt->map = map; + + elt->key = kzalloc(map->key_size, GFP_KERNEL); + if (!elt->key) { + err = -ENOMEM; + goto free; + } + + elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL); + if (!elt->fields) { + err = -ENOMEM; + goto free; + } + + tracing_map_elt_init_fields(elt); + + if (map->ops && map->ops->elt_alloc) { + err = map->ops->elt_alloc(elt); + if (err) + goto free; + } + return elt; + free: + tracing_map_elt_free(elt); + + return ERR_PTR(err); +} + +static struct tracing_map_elt *get_free_elt(struct tracing_map *map) +{ + struct tracing_map_elt *elt = NULL; + int idx; + + idx = atomic_inc_return(&map->next_elt); + if (idx < map->max_elts) { + elt = *(TRACING_MAP_ELT(map->elts, idx)); + if (map->ops && map->ops->elt_init) + map->ops->elt_init(elt); + } + + return elt; +} + +static void tracing_map_free_elts(struct tracing_map *map) +{ + unsigned int i; + + if (!map->elts) + return; + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i))); + + tracing_map_array_free(map->elts); +} + +static int tracing_map_alloc_elts(struct tracing_map *map) +{ + unsigned int i; + + map->elts = tracing_map_array_alloc(map->max_elts, + sizeof(struct tracing_map_elt *)); + if (!map->elts) + return -ENOMEM; + + for (i = 0; i < map->max_elts; i++) { + *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map); + if (!(*(TRACING_MAP_ELT(map->elts, i)))) { + tracing_map_free_elts(map); + + return -ENOMEM; + } + } + + return 0; +} + +static inline bool keys_match(void *key, void *test_key, unsigned key_size) +{ + bool match = true; + + if (memcmp(key, test_key, key_size)) + match = false; + + return match; +} + +static inline struct tracing_map_elt * +__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) +{ + u32 idx, key_hash, test_key; + struct tracing_map_entry *entry; + + key_hash = jhash(key, map->key_size, 0); + if (key_hash == 0) + key_hash = 1; + idx = key_hash >> (32 - (map->map_bits + 1)); + + while (1) { + idx &= (map->map_size - 1); + entry = TRACING_MAP_ENTRY(map->map, idx); + test_key = entry->key; + + if (test_key && test_key == key_hash && entry->val && + keys_match(key, entry->val->key, map->key_size)) { + atomic64_inc(&map->hits); + return entry->val; + } + + if (!test_key) { + if (lookup_only) + break; + + if (!cmpxchg(&entry->key, 0, key_hash)) { + struct tracing_map_elt *elt; + + elt = get_free_elt(map); + if (!elt) { + atomic64_inc(&map->drops); + entry->key = 0; + break; + } + + memcpy(elt->key, key, map->key_size); + entry->val = elt; + atomic64_inc(&map->hits); + + return entry->val; + } + } + + idx++; + } + + return NULL; +} + +/** + * tracing_map_insert - Insert key and/or retrieve val from a tracing_map + * @map: The tracing_map to insert into + * @key: The key to insert + * + * Inserts a key into a tracing_map and creates and returns a new + * tracing_map_elt for it, or if the key has already been inserted by + * a previous call, returns the tracing_map_elt already associated + * with it. When the map was created, the number of elements to be + * allocated for the map was specified (internally maintained as + * 'max_elts' in struct tracing_map), and that number of + * tracing_map_elts was created by tracing_map_init(). This is the + * pre-allocated pool of tracing_map_elts that tracing_map_insert() + * will allocate from when adding new keys. Once that pool is + * exhausted, tracing_map_insert() is useless and will return NULL to + * signal that state. There are two user-visible tracing_map + * variables, 'hits' and 'drops', which are updated by this function. + * Every time an element is either successfully inserted or retrieved, + * the 'hits' value is incrememented. Every time an element insertion + * fails, the 'drops' value is incremented. + * + * This is a lock-free tracing map insertion function implementing a + * modified form of Cliff Click's basic insertion algorithm. It + * requires the table size be a power of two. To prevent any + * possibility of an infinite loop we always make the internal table + * size double the size of the requested table size (max_elts * 2). + * Likewise, we never reuse a slot or resize or delete elements - when + * we've reached max_elts entries, we simply return NULL once we've + * run out of entries. Readers can at any point in time traverse the + * tracing map and safely access the key/val pairs. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If this was a newly inserted key, the val will be a newly allocated + * and associated tracing_map_elt pointer val. If the key wasn't + * found and the pool of tracing_map_elts has been exhausted, NULL is + * returned and no further insertions will succeed. + */ +struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, false); +} + +/** + * tracing_map_lookup - Retrieve val from a tracing_map + * @map: The tracing_map to perform the lookup on + * @key: The key to look up + * + * Looks up key in tracing_map and if found returns the matching + * tracing_map_elt. This is a lock-free lookup; see + * tracing_map_insert() for details on tracing_map and how it works. + * Every time an element is retrieved, the 'hits' value is + * incrememented. There is one user-visible tracing_map variable, + * 'hits', which is updated by this function. Every time an element + * is successfully retrieved, the 'hits' value is incrememented. The + * 'drops' value is never updated by this function. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If the key wasn't found, NULL is returned. + */ +struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, true); +} + +/** + * tracing_map_destroy - Destroy a tracing_map + * @map: The tracing_map to destroy + * + * Frees a tracing_map along with its associated array of + * tracing_map_elts. + * + * Callers should make sure there are no readers or writers actively + * reading or inserting into the map before calling this. + */ +void tracing_map_destroy(struct tracing_map *map) +{ + if (!map) + return; + + tracing_map_free_elts(map); + + tracing_map_array_free(map->map); + kfree(map); +} + +/** + * tracing_map_clear - Clear a tracing_map + * @map: The tracing_map to clear + * + * Resets the tracing map to a cleared or initial state. The + * tracing_map_elts are all cleared, and the array of struct + * tracing_map_entry is reset to an initialized state. + * + * Callers should make sure there are no writers actively inserting + * into the map before calling this. + */ +void tracing_map_clear(struct tracing_map *map) +{ + unsigned int i; + + atomic_set(&map->next_elt, -1); + atomic64_set(&map->hits, 0); + atomic64_set(&map->drops, 0); + + tracing_map_array_clear(map->map); + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i))); +} + +static void set_sort_key(struct tracing_map *map, + struct tracing_map_sort_key *sort_key) +{ + map->sort_key = *sort_key; +} + +/** + * tracing_map_create - Create a lock-free map and element pool + * @map_bits: The size of the map (2 ** map_bits) + * @key_size: The size of the key for the map in bytes + * @ops: Optional client-defined tracing_map_ops instance + * @private_data: Client data associated with the map + * + * Creates and sets up a map to contain 2 ** map_bits number of + * elements (internally maintained as 'max_elts' in struct + * tracing_map). Before using, map fields should be added to the map + * with tracing_map_add_sum_field() and tracing_map_add_key_field(). + * tracing_map_init() should then be called to allocate the array of + * tracing_map_elts, in order to avoid allocating anything in the map + * insertion path. The user-specified map size reflects the maximum + * number of elements that can be contained in the table requested by + * the user - internally we double that in order to keep the table + * sparse and keep collisions manageable. + * + * A tracing_map is a special-purpose map designed to aggregate or + * 'sum' one or more values associated with a specific object of type + * tracing_map_elt, which is attached by the map to a given key. + * + * tracing_map_create() sets up the map itself, and provides + * operations for inserting tracing_map_elts, but doesn't allocate the + * tracing_map_elts themselves, or provide a means for describing the + * keys or sums associated with the tracing_map_elts. All + * tracing_map_elts for a given map have the same set of sums and + * keys, which are defined by the client using the functions + * tracing_map_add_key_field() and tracing_map_add_sum_field(). Once + * the fields are defined, the pool of elements allocated for the map + * can be created, which occurs when the client code calls + * tracing_map_init(). + * + * When tracing_map_init() returns, tracing_map_elt elements can be + * inserted into the map using tracing_map_insert(). When called, + * tracing_map_insert() grabs a free tracing_map_elt from the pool, or + * finds an existing match in the map and in either case returns it. + * The client can then use tracing_map_update_sum() and + * tracing_map_read_sum() to update or read a given sum field for the + * tracing_map_elt. + * + * The client can at any point retrieve and traverse the current set + * of inserted tracing_map_elts in a tracing_map, via + * tracing_map_sort_entries(). Sorting can be done on any field, + * including keys. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. + */ +struct tracing_map *tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data) +{ + struct tracing_map *map; + unsigned int i; + + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + return ERR_PTR(-EINVAL); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map_bits = map_bits; + map->max_elts = (1 << map_bits); + atomic_set(&map->next_elt, -1); + + map->map_size = (1 << (map_bits + 1)); + map->ops = ops; + + map->private_data = private_data; + + map->map = tracing_map_array_alloc(map->map_size, + sizeof(struct tracing_map_entry)); + if (!map->map) + goto free; + + map->key_size = key_size; + for (i = 0; i < TRACING_MAP_KEYS_MAX; i++) + map->key_idx[i] = -1; + out: + return map; + free: + tracing_map_destroy(map); + map = ERR_PTR(-ENOMEM); + + goto out; +} + +/** + * tracing_map_init - Allocate and clear a map's tracing_map_elts + * @map: The tracing_map to initialize + * + * Allocates a clears a pool of tracing_map_elts equal to the + * user-specified size of 2 ** map_bits (internally maintained as + * 'max_elts' in struct tracing_map). Before using, the map fields + * should be added to the map with tracing_map_add_sum_field() and + * tracing_map_add_key_field(). tracing_map_init() should then be + * called to allocate the array of tracing_map_elts, in order to avoid + * allocating anything in the map insertion path. The user-specified + * map size reflects the max number of elements requested by the user + * - internally we double that in order to keep the table sparse and + * keep collisions manageable. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. + */ +int tracing_map_init(struct tracing_map *map) +{ + int err; + + if (map->n_fields < 2) + return -EINVAL; /* need at least 1 key and 1 val */ + + err = tracing_map_alloc_elts(map); + if (err) + return err; + + tracing_map_clear(map); + + return err; +} + +static int cmp_entries_dup(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + int ret = 0; + + if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + ret = 1; + + return ret; +} + +static int cmp_entries_sum(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + cmp_fn = field->cmp_fn; + + val_a = &elt_a->fields[sort_key->field_idx].sum; + val_b = &elt_b->fields[sort_key->field_idx].sum; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static int cmp_entries_key(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + + cmp_fn = field->cmp_fn; + + val_a = elt_a->key + field->offset; + val_b = elt_b->key + field->offset; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static void destroy_sort_entry(struct tracing_map_sort_entry *entry) +{ + if (!entry) + return; + + if (entry->elt_copied) + tracing_map_elt_free(entry->elt); + + kfree(entry); +} + +/** + * tracing_map_destroy_sort_entries - Destroy an array of sort entries + * @entries: The entries to destroy + * @n_entries: The number of entries in the array + * + * Destroy the elements returned by a tracing_map_sort_entries() call. + */ +void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries) +{ + unsigned int i; + + for (i = 0; i < n_entries; i++) + destroy_sort_entry(entries[i]); + + vfree(entries); +} + +static struct tracing_map_sort_entry * +create_sort_entry(void *key, struct tracing_map_elt *elt) +{ + struct tracing_map_sort_entry *sort_entry; + + sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL); + if (!sort_entry) + return NULL; + + sort_entry->key = key; + sort_entry->elt = elt; + + return sort_entry; +} + +static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) +{ + struct tracing_map_elt *dup_elt; + unsigned int i; + + dup_elt = tracing_map_elt_alloc(elt->map); + if (!dup_elt) + return NULL; + + if (elt->map->ops && elt->map->ops->elt_copy) + elt->map->ops->elt_copy(dup_elt, elt); + + dup_elt->private_data = elt->private_data; + memcpy(dup_elt->key, elt->key, elt->map->key_size); + + for (i = 0; i < elt->map->n_fields; i++) { + atomic64_set(&dup_elt->fields[i].sum, + atomic64_read(&elt->fields[i].sum)); + dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; + } + + return dup_elt; +} + +static int merge_dup(struct tracing_map_sort_entry **sort_entries, + unsigned int target, unsigned int dup) +{ + struct tracing_map_elt *target_elt, *elt; + bool first_dup = (target - dup) == 1; + int i; + + if (first_dup) { + elt = sort_entries[target]->elt; + target_elt = copy_elt(elt); + if (!target_elt) + return -ENOMEM; + sort_entries[target]->elt = target_elt; + sort_entries[target]->elt_copied = true; + } else + target_elt = sort_entries[target]->elt; + + elt = sort_entries[dup]->elt; + + for (i = 0; i < elt->map->n_fields; i++) + atomic64_add(atomic64_read(&elt->fields[i].sum), + &target_elt->fields[i].sum); + + sort_entries[dup]->dup = true; + + return 0; +} + +static int merge_dups(struct tracing_map_sort_entry **sort_entries, + int n_entries, unsigned int key_size) +{ + unsigned int dups = 0, total_dups = 0; + int err, i, j; + void *key; + + if (n_entries < 2) + return total_dups; + + sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_dup, NULL); + + key = sort_entries[0]->key; + for (i = 1; i < n_entries; i++) { + if (!memcmp(sort_entries[i]->key, key, key_size)) { + dups++; total_dups++; + err = merge_dup(sort_entries, i - dups, i); + if (err) + return err; + continue; + } + key = sort_entries[i]->key; + dups = 0; + } + + if (!total_dups) + return total_dups; + + for (i = 0, j = 0; i < n_entries; i++) { + if (!sort_entries[i]->dup) { + sort_entries[j] = sort_entries[i]; + if (j++ != i) + sort_entries[i] = NULL; + } else { + destroy_sort_entry(sort_entries[i]); + sort_entries[i] = NULL; + } + } + + return total_dups; +} + +static bool is_key(struct tracing_map *map, unsigned int field_idx) +{ + unsigned int i; + + for (i = 0; i < map->n_keys; i++) + if (map->key_idx[i] == field_idx) + return true; + return false; +} + +static void sort_secondary(struct tracing_map *map, + const struct tracing_map_sort_entry **entries, + unsigned int n_entries, + struct tracing_map_sort_key *primary_key, + struct tracing_map_sort_key *secondary_key) +{ + int (*primary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + int (*secondary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + unsigned i, start = 0, n_sub = 1; + + if (is_key(map, primary_key->field_idx)) + primary_fn = cmp_entries_key; + else + primary_fn = cmp_entries_sum; + + if (is_key(map, secondary_key->field_idx)) + secondary_fn = cmp_entries_key; + else + secondary_fn = cmp_entries_sum; + + for (i = 0; i < n_entries - 1; i++) { + const struct tracing_map_sort_entry **a = &entries[i]; + const struct tracing_map_sort_entry **b = &entries[i + 1]; + + if (primary_fn(a, b) == 0) { + n_sub++; + if (i < n_entries - 2) + continue; + } + + if (n_sub < 2) { + start = i + 1; + n_sub = 1; + continue; + } + + set_sort_key(map, secondary_key); + sort(&entries[start], n_sub, + sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))secondary_fn, NULL); + set_sort_key(map, primary_key); + + start = i + 1; + n_sub = 1; + } +} + +/** + * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map + * @map: The tracing_map + * @sort_key: The sort key to use for sorting + * @sort_entries: outval: pointer to allocated and sorted array of entries + * + * tracing_map_sort_entries() sorts the current set of entries in the + * map and returns the list of tracing_map_sort_entries containing + * them to the client in the sort_entries param. The client can + * access the struct tracing_map_elt element of interest directly as + * the 'elt' field of a returned struct tracing_map_sort_entry object. + * + * The sort_key has only two fields: idx and descending. 'idx' refers + * to the index of the field added via tracing_map_add_sum_field() or + * tracing_map_add_key_field() when the tracing_map was initialized. + * 'descending' is a flag that if set reverses the sort order, which + * by default is ascending. + * + * The client should not hold on to the returned array but should use + * it and call tracing_map_destroy_sort_entries() when done. + * + * Return: the number of sort_entries in the struct tracing_map_sort_entry + * array, negative on error + */ +int tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries) +{ + int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + struct tracing_map_sort_entry *sort_entry, **entries; + int i, n_entries, ret; + + entries = vmalloc(map->max_elts * sizeof(sort_entry)); + if (!entries) + return -ENOMEM; + + for (i = 0, n_entries = 0; i < map->map_size; i++) { + struct tracing_map_entry *entry; + + entry = TRACING_MAP_ENTRY(map->map, i); + + if (!entry->key || !entry->val) + continue; + + entries[n_entries] = create_sort_entry(entry->val->key, + entry->val); + if (!entries[n_entries++]) { + ret = -ENOMEM; + goto free; + } + } + + if (n_entries == 0) { + ret = 0; + goto free; + } + + if (n_entries == 1) { + *sort_entries = entries; + return 1; + } + + ret = merge_dups(entries, n_entries, map->key_size); + if (ret < 0) + goto free; + n_entries -= ret; + + if (is_key(map, sort_keys[0].field_idx)) + cmp_entries_fn = cmp_entries_key; + else + cmp_entries_fn = cmp_entries_sum; + + set_sort_key(map, &sort_keys[0]); + + sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_fn, NULL); + + if (n_sort_keys > 1) + sort_secondary(map, + (const struct tracing_map_sort_entry **)entries, + n_entries, + &sort_keys[0], + &sort_keys[1]); + + *sort_entries = entries; + + return n_entries; + free: + tracing_map_destroy_sort_entries(entries, n_entries); + + return ret; +} diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h new file mode 100644 index 000000000000..75718255a8ad --- /dev/null +++ b/kernel/trace/tracing_map.h @@ -0,0 +1,282 @@ +#ifndef __TRACING_MAP_H +#define __TRACING_MAP_H + +#define TRACING_MAP_BITS_DEFAULT 11 +#define TRACING_MAP_BITS_MAX 17 +#define TRACING_MAP_BITS_MIN 7 + +#define TRACING_MAP_FIELDS_MAX 4 +#define TRACING_MAP_KEYS_MAX 2 + +#define TRACING_MAP_SORT_KEYS_MAX 2 + +typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); + +/* + * This is an overview of the tracing_map data structures and how they + * relate to the tracing_map API. The details of the algorithms + * aren't discussed here - this is just a general overview of the data + * structures and how they interact with the API. + * + * The central data structure of the tracing_map is an initially + * zeroed array of struct tracing_map_entry (stored in the map field + * of struct tracing_map). tracing_map_entry is a very simple data + * structure containing only two fields: a 32-bit unsigned 'key' + * variable and a pointer named 'val'. This array of struct + * tracing_map_entry is essentially a hash table which will be + * modified by a single function, tracing_map_insert(), but which can + * be traversed and read by a user at any time (though the user does + * this indirectly via an array of tracing_map_sort_entry - see the + * explanation of that data structure in the discussion of the + * sorting-related data structures below). + * + * The central function of the tracing_map API is + * tracing_map_insert(). tracing_map_insert() hashes the + * arbitrarily-sized key passed into it into a 32-bit unsigned key. + * It then uses this key, truncated to the array size, as an index + * into the array of tracing_map_entries. If the value of the 'key' + * field of the tracing_map_entry found at that location is 0, then + * that entry is considered to be free and can be claimed, by + * replacing the 0 in the 'key' field of the tracing_map_entry with + * the new 32-bit hashed key. Once claimed, that tracing_map_entry's + * 'val' field is then used to store a unique element which will be + * forever associated with that 32-bit hashed key in the + * tracing_map_entry. + * + * That unique element now in the tracing_map_entry's 'val' field is + * an instance of tracing_map_elt, where 'elt' in the latter part of + * that variable name is short for 'element'. The purpose of a + * tracing_map_elt is to hold values specific to the the particular + * 32-bit hashed key it's assocated with. Things such as the unique + * set of aggregated sums associated with the 32-bit hashed key, along + * with a copy of the full key associated with the entry, and which + * was used to produce the 32-bit hashed key. + * + * When tracing_map_create() is called to create the tracing map, the + * user specifies (indirectly via the map_bits param, the details are + * unimportant for this discussion) the maximum number of elements + * that the map can hold (stored in the max_elts field of struct + * tracing_map). This is the maximum possible number of + * tracing_map_entries in the tracing_map_entry array which can be + * 'claimed' as described in the above discussion, and therefore is + * also the maximum number of tracing_map_elts that can be associated + * with the tracing_map_entry array in the tracing_map. Because of + * the way the insertion algorithm works, the size of the allocated + * tracing_map_entry array is always twice the maximum number of + * elements (2 * max_elts). This value is stored in the map_size + * field of struct tracing_map. + * + * Because tracing_map_insert() needs to work from any context, + * including from within the memory allocation functions themselves, + * both the tracing_map_entry array and a pool of max_elts + * tracing_map_elts are pre-allocated before any call is made to + * tracing_map_insert(). + * + * The tracing_map_entry array is allocated as a single block by + * tracing_map_create(). + * + * Because the tracing_map_elts are much larger objects and can't + * generally be allocated together as a single large array without + * failure, they're allocated individually, by tracing_map_init(). + * + * The pool of tracing_map_elts are allocated by tracing_map_init() + * rather than by tracing_map_create() because at the time + * tracing_map_create() is called, there isn't enough information to + * create the tracing_map_elts. Specifically,the user first needs to + * tell the tracing_map implementation how many fields the + * tracing_map_elts contain, and which types of fields they are (key + * or sum). The user does this via the tracing_map_add_sum_field() + * and tracing_map_add_key_field() functions, following which the user + * calls tracing_map_init() to finish up the tracing map setup. The + * array holding the pointers which make up the pre-allocated pool of + * tracing_map_elts is allocated as a single block and is stored in + * the elts field of struct tracing_map. + * + * There is also a set of structures used for sorting that might + * benefit from some minimal explanation. + * + * struct tracing_map_sort_key is used to drive the sort at any given + * time. By 'any given time' we mean that a different + * tracing_map_sort_key will be used at different times depending on + * whether the sort currently being performed is a primary or a + * secondary sort. + * + * The sort key is very simple, consisting of the field index of the + * tracing_map_elt field to sort on (which the user saved when adding + * the field), and whether the sort should be done in an ascending or + * descending order. + * + * For the convenience of the sorting code, a tracing_map_sort_entry + * is created for each tracing_map_elt, again individually allocated + * to avoid failures that might be expected if allocated as a single + * large array of struct tracing_map_sort_entry. + * tracing_map_sort_entry instances are the objects expected by the + * various internal sorting functions, and are also what the user + * ultimately receives after calling tracing_map_sort_entries(). + * Because it doesn't make sense for users to access an unordered and + * sparsely populated tracing_map directly, the + * tracing_map_sort_entries() function is provided so that users can + * retrieve a sorted list of all existing elements. In addition to + * the associated tracing_map_elt 'elt' field contained within the + * tracing_map_sort_entry, which is the object of interest to the + * user, tracing_map_sort_entry objects contain a number of additional + * fields which are used for caching and internal purposes and can + * safely be ignored. +*/ + +struct tracing_map_field { + tracing_map_cmp_fn_t cmp_fn; + union { + atomic64_t sum; + unsigned int offset; + }; +}; + +struct tracing_map_elt { + struct tracing_map *map; + struct tracing_map_field *fields; + void *key; + void *private_data; +}; + +struct tracing_map_entry { + u32 key; + struct tracing_map_elt *val; +}; + +struct tracing_map_sort_key { + unsigned int field_idx; + bool descending; +}; + +struct tracing_map_sort_entry { + void *key; + struct tracing_map_elt *elt; + bool elt_copied; + bool dup; +}; + +struct tracing_map_array { + unsigned int entries_per_page; + unsigned int entry_size_shift; + unsigned int entry_shift; + unsigned int entry_mask; + unsigned int n_pages; + void **pages; +}; + +#define TRACING_MAP_ARRAY_ELT(array, idx) \ + (array->pages[idx >> array->entry_shift] + \ + ((idx & array->entry_mask) << array->entry_size_shift)) + +#define TRACING_MAP_ENTRY(array, idx) \ + ((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx)) + +#define TRACING_MAP_ELT(array, idx) \ + ((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx)) + +struct tracing_map { + unsigned int key_size; + unsigned int map_bits; + unsigned int map_size; + unsigned int max_elts; + atomic_t next_elt; + struct tracing_map_array *elts; + struct tracing_map_array *map; + const struct tracing_map_ops *ops; + void *private_data; + struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_fields; + int key_idx[TRACING_MAP_KEYS_MAX]; + unsigned int n_keys; + struct tracing_map_sort_key sort_key; + atomic64_t hits; + atomic64_t drops; +}; + +/** + * struct tracing_map_ops - callbacks for tracing_map + * + * The methods in this structure define callback functions for various + * operations on a tracing_map or objects related to a tracing_map. + * + * For a detailed description of tracing_map_elt objects please see + * the overview of tracing_map data structures at the beginning of + * this file. + * + * All the methods below are optional. + * + * @elt_alloc: When a tracing_map_elt is allocated, this function, if + * defined, will be called and gives clients the opportunity to + * allocate additional data and attach it to the element + * (tracing_map_elt->private_data is meant for that purpose). + * Element allocation occurs before tracing begins, when the + * tracing_map_init() call is made by client code. + * + * @elt_copy: At certain points in the lifetime of an element, it may + * need to be copied. The copy should include a copy of the + * client-allocated data, which can be copied into the 'to' + * element from the 'from' element. + * + * @elt_free: When a tracing_map_elt is freed, this function is called + * and allows client-allocated per-element data to be freed. + * + * @elt_clear: This callback allows per-element client-defined data to + * be cleared, if applicable. + * + * @elt_init: This callback allows per-element client-defined data to + * be initialized when used i.e. when the element is actually + * claimed by tracing_map_insert() in the context of the map + * insertion. + */ +struct tracing_map_ops { + int (*elt_alloc)(struct tracing_map_elt *elt); + void (*elt_copy)(struct tracing_map_elt *to, + struct tracing_map_elt *from); + void (*elt_free)(struct tracing_map_elt *elt); + void (*elt_clear)(struct tracing_map_elt *elt); + void (*elt_init)(struct tracing_map_elt *elt); +}; + +extern struct tracing_map * +tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data); +extern int tracing_map_init(struct tracing_map *map); + +extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn); + +extern void tracing_map_destroy(struct tracing_map *map); +extern void tracing_map_clear(struct tracing_map *map); + +extern struct tracing_map_elt * +tracing_map_insert(struct tracing_map *map, void *key); +extern struct tracing_map_elt * +tracing_map_lookup(struct tracing_map *map, void *key); + +extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed); +extern int tracing_map_cmp_string(void *val_a, void *val_b); +extern int tracing_map_cmp_none(void *val_a, void *val_b); + +extern void tracing_map_update_sum(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern void tracing_map_set_field_descr(struct tracing_map *map, + unsigned int i, + unsigned int key_offset, + tracing_map_cmp_fn_t cmp_fn); +extern int +tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries); + +extern void +tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries); +#endif /* __TRACING_MAP_H */ -- cgit v1.2.3 From 8d44f2f34fb50f40185ef6404b0047223a27ba82 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 19 Feb 2016 14:47:00 -0500 Subject: tracing: Fix TRACING_MAP Kconfig The config option for TRACING_MAP has "default n", which is not needed because the default of configs is 'n'. Also, since the TRACING_MAP has no config prompt, there's no reason to include "If in doubt, say N" in the help text. Fixed a typo in the comments of tracing_map.h. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 3 --- kernel/trace/tracing_map.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 48b732943e0e..d39556fd863a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -531,7 +531,6 @@ config MMIOTRACE config TRACING_MAP bool depends on ARCH_HAVE_NMI_SAFE_CMPXCHG - default n help tracing_map is a special-purpose lock-free map for tracing, separated out as a stand-alone facility in order to allow it @@ -539,8 +538,6 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. - If in doubt, say N. - config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 75718255a8ad..1f7eda548787 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -46,7 +46,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); * That unique element now in the tracing_map_entry's 'val' field is * an instance of tracing_map_elt, where 'elt' in the latter part of * that variable name is short for 'element'. The purpose of a - * tracing_map_elt is to hold values specific to the the particular + * tracing_map_elt is to hold values specific to the particular * 32-bit hashed key it's assocated with. Things such as the unique * set of aggregated sums associated with the 32-bit hashed key, along * with a copy of the full key associated with the entry, and which -- cgit v1.2.3 From 3b772b96b8338bca2532839b2cd7802800e66037 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:41 -0600 Subject: tracing: Update some tracing_map constants and comments Make it clear exactly how many keys and values are supported through better defines, and add 1 to the vals count, since normally clients want support for at least a hitcount and two other values. Also, note the error return value for tracing_map_add_key/val_field() in the comments. Link: http://lkml.kernel.org/r/6696fa02ebc716aa344c27a571a2afaa25e5b4d4.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/tracing_map.c | 4 ++-- kernel/trace/tracing_map.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index dd6401dd145e..e0f172932eca 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -163,7 +163,7 @@ static int tracing_map_add_field(struct tracing_map *map, * tracing_map_update_sum() or reading it via tracing_map_read_sum(). * * Return: The index identifying the field in the map and associated - * tracing_map_elts. + * tracing_map_elts, or -EINVAL on error. */ int tracing_map_add_sum_field(struct tracing_map *map) { @@ -184,7 +184,7 @@ int tracing_map_add_sum_field(struct tracing_map *map) * the key referenced by this key field resides. * * Return: The index identifying the field in the map and associated - * tracing_map_elts. + * tracing_map_elts, or -EINVAL on error. */ int tracing_map_add_key_field(struct tracing_map *map, unsigned int offset, diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 1f7eda548787..618838f5f30a 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -5,9 +5,10 @@ #define TRACING_MAP_BITS_MAX 17 #define TRACING_MAP_BITS_MIN 7 -#define TRACING_MAP_FIELDS_MAX 4 #define TRACING_MAP_KEYS_MAX 2 - +#define TRACING_MAP_VALS_MAX 3 +#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ + TRACING_MAP_VALS_MAX) #define TRACING_MAP_SORT_KEYS_MAX 2 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); -- cgit v1.2.3 From 7ef224d1d0e3a1ade02d02c01ce1dcffb736d2c3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:42 -0600 Subject: tracing: Add 'hist' event trigger command 'hist' triggers allow users to continually aggregate trace events, which can then be viewed afterwards by simply reading a 'hist' file containing the aggregation in a human-readable format. The basic idea is very simple and boils down to a mechanism whereby trace events, rather than being exhaustively dumped in raw form and viewed directly, are automatically 'compressed' into meaningful tables completely defined by the user. This is done strictly via single-line command-line commands and without the aid of any kind of programming language or interpreter. A surprising number of typical use cases can be accomplished by users via this simple mechanism. In fact, a large number of the tasks that users typically do using the more complicated script-based tracing tools, at least during the initial stages of an investigation, can be accomplished by simply specifying a set of keys and values to be used in the creation of a hash table. The Linux kernel trace event subsystem happens to provide an extensive list of keys and values ready-made for such a purpose in the form of the event format files associated with each trace event. By simply consulting the format file for field names of interest and by plugging them into the hist trigger command, users can create an endless number of useful aggregations to help with investigating various properties of the system. See Documentation/trace/events.txt for examples. hist triggers are implemented on top of the existing event trigger infrastructure, and as such are consistent with the existing triggers from a user's perspective as well. The basic syntax follows the existing trigger syntax. Users start an aggregation by writing a 'hist' trigger to the event of interest's trigger file: # echo hist:keys=xxx [ if filter] > event/trigger Once a hist trigger has been set up, by default it continually aggregates every matching event into a hash table using the event key and a value field named 'hitcount'. To view the aggregation at any point in time, simply read the 'hist' file in the same directory as the 'trigger' file: # cat event/hist The detailed syntax provides additional options for user control, and is described exhaustively in Documentation/trace/events.txt and in the virtual tracing/README file in the tracing subsystem. Link: http://lkml.kernel.org/r/72d263b5e1853fe9c314953b65833c3aa75479f2.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 16 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 17 + kernel/trace/trace.h | 7 + kernel/trace/trace_events.c | 4 + kernel/trace/trace_events_hist.c | 849 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 1 + 7 files changed, 895 insertions(+) create mode 100644 kernel/trace/trace_events_hist.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d39556fd863a..fafeaf803bd0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -538,6 +538,22 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. +config HIST_TRIGGERS + bool "Histogram triggers" + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + select TRACING_MAP + default n + help + Hist triggers allow one or more arbitrary trace event fields + to be aggregated into hash tables and dumped to stdout by + reading a debugfs/tracefs file. They're useful for + gathering quick and dirty (though precise) summaries of + event activity as an initial guide for further investigation + using more advanced tools. + + See Documentation/trace/events.txt. + If in doubt, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 4255c4057aaa..979e7bfbde7a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0d12dbde8399..6cf8fd03b028 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3812,6 +3812,9 @@ static const char readme_msg[] = #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" #endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" @@ -3828,6 +3831,20 @@ static const char readme_msg[] = "\t To remove a trigger with a count:\n" "\t echo '!:0 > //trigger\n" "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=\n" + "\t [:size=#entries]\n" + "\t [if ]\n\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key named, and the value of a sum called\n" + "\t 'hitcount' is incremented. Keys correspond to fields in the\n" + "\t event's format description. Keys can be any field. The\n" + "\t 'size' parameter can be used to specify more or fewer than\n" + "\t the default 2048 entries for the hashtable size.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout." +#endif ; static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2525042760e6..505f8a45f426 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1162,6 +1162,13 @@ extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; +extern const struct file_operations event_hist_fops; + +#ifdef CONFIG_HIST_TRIGGERS +extern int register_trigger_hist_cmd(void); +#else +static inline int register_trigger_hist_cmd(void) { return 0; } +#endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index add81dff7520..e7cb983ee93c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2141,6 +2141,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); +#ifdef CONFIG_HIST_TRIGGERS + trace_create_file("hist", 0444, file->dir, file, + &event_hist_fops); +#endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c new file mode 100644 index 000000000000..23b45e462117 --- /dev/null +++ b/kernel/trace/trace_events_hist.c @@ -0,0 +1,849 @@ +/* + * trace_events_hist - trace event hist triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2015 Tom Zanussi + */ + +#include +#include +#include +#include +#include + +#include "tracing_map.h" +#include "trace.h" + +struct hist_field; + +typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); + +struct hist_field { + struct ftrace_event_field *field; + unsigned long flags; + hist_field_fn_t fn; + unsigned int size; +}; + +static u64 hist_field_counter(struct hist_field *field, void *event) +{ + return 1; +} + +static u64 hist_field_string(struct hist_field *hist_field, void *event) +{ + char *addr = (char *)(event + hist_field->field->offset); + + return (u64)(unsigned long)addr; +} + +#define DEFINE_HIST_FIELD_FN(type) \ +static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ +{ \ + type *addr = (type *)(event + hist_field->field->offset); \ + \ + return (u64)*addr; \ +} + +DEFINE_HIST_FIELD_FN(s64); +DEFINE_HIST_FIELD_FN(u64); +DEFINE_HIST_FIELD_FN(s32); +DEFINE_HIST_FIELD_FN(u32); +DEFINE_HIST_FIELD_FN(s16); +DEFINE_HIST_FIELD_FN(u16); +DEFINE_HIST_FIELD_FN(s8); +DEFINE_HIST_FIELD_FN(u8); + +#define for_each_hist_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_fields; (i)++) + +#define for_each_hist_val_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_vals; (i)++) + +#define for_each_hist_key_field(i, hist_data) \ + for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) + +#define HITCOUNT_IDX 0 +#define HIST_KEY_MAX 1 +#define HIST_KEY_SIZE_MAX MAX_FILTER_STR_VAL + +enum hist_field_flags { + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, +}; + +struct hist_trigger_attrs { + char *keys_str; + unsigned int map_bits; +}; + +struct hist_trigger_data { + struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_vals; + unsigned int n_keys; + unsigned int n_fields; + unsigned int key_size; + struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; + unsigned int n_sort_keys; + struct trace_event_file *event_file; + struct hist_trigger_attrs *attrs; + struct tracing_map *map; +}; + +static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +{ + hist_field_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = hist_field_s64; + else + fn = hist_field_u64; + break; + case 4: + if (field_is_signed) + fn = hist_field_s32; + else + fn = hist_field_u32; + break; + case 2: + if (field_is_signed) + fn = hist_field_s16; + else + fn = hist_field_u16; + break; + case 1: + if (field_is_signed) + fn = hist_field_s8; + else + fn = hist_field_u8; + break; + } + + return fn; +} + +static int parse_map_size(char *str) +{ + unsigned long size, map_bits; + int ret; + + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + ret = kstrtoul(str, 0, &size); + if (ret) + goto out; + + map_bits = ilog2(roundup_pow_of_two(size)); + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + ret = -EINVAL; + else + ret = map_bits; + out: + return ret; +} + +static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) +{ + if (!attrs) + return; + + kfree(attrs->keys_str); + kfree(attrs); +} + +static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +{ + struct hist_trigger_attrs *attrs; + int ret = 0; + + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + while (trigger_str) { + char *str = strsep(&trigger_str, ":"); + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) + attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto free; + } + attrs->map_bits = map_bits; + } else { + ret = -EINVAL; + goto free; + } + } + + if (!attrs->keys_str) { + ret = -EINVAL; + goto free; + } + + return attrs; + free: + destroy_hist_trigger_attrs(attrs); + + return ERR_PTR(ret); +} + +static void destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field); +} + +static struct hist_field *create_hist_field(struct ftrace_event_field *field, + unsigned long flags) +{ + struct hist_field *hist_field; + + if (field && is_function_field(field)) + return NULL; + + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!hist_field) + return NULL; + + if (flags & HIST_FIELD_FL_HITCOUNT) { + hist_field->fn = hist_field_counter; + goto out; + } + + if (is_string_field(field)) { + flags |= HIST_FIELD_FL_STRING; + hist_field->fn = hist_field_string; + } else { + hist_field->fn = select_value_fn(field->size, + field->is_signed); + if (!hist_field->fn) { + destroy_hist_field(hist_field); + return NULL; + } + } + out: + hist_field->field = field; + hist_field->flags = flags; + + return hist_field; +} + +static void destroy_hist_fields(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + if (hist_data->fields[i]) { + destroy_hist_field(hist_data->fields[i]); + hist_data->fields[i] = NULL; + } + } +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int create_val_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_hitcount_val(hist_data); + + return ret; +} + +static int create_key_field(struct hist_trigger_data *hist_data, + unsigned int key_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + unsigned int key_size; + int ret = 0; + + if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + flags |= HIST_FIELD_FL_KEY; + + field = trace_find_event_field(file->event_call, field_str); + if (!field) { + ret = -EINVAL; + goto out; + } + + key_size = field->size; + + hist_data->fields[key_idx] = create_hist_field(field, flags); + if (!hist_data->fields[key_idx]) { + ret = -ENOMEM; + goto out; + } + + key_size = ALIGN(key_size, sizeof(u64)); + hist_data->fields[key_idx]->size = key_size; + hist_data->key_size = key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { + ret = -EINVAL; + goto out; + } + + hist_data->n_keys++; + + if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) + return -EINVAL; + + ret = key_size; + out: + return ret; +} + +static int create_key_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, n_vals = hist_data->n_vals; + char *fields_str, *field_str; + int ret = -EINVAL; + + fields_str = hist_data->attrs->keys_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + ret = create_key_field(hist_data, i, file, field_str); + if (ret < 0) + goto out; + } + if (fields_str) { + ret = -EINVAL; + goto out; + } + ret = 0; + out: + return ret; +} + +static int create_hist_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_val_fields(hist_data, file); + if (ret) + goto out; + + ret = create_key_fields(hist_data, file); + if (ret) + goto out; + + hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + out: + return ret; +} + +static int create_sort_keys(struct hist_trigger_data *hist_data) +{ + int ret = 0; + + hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */ + + return ret; +} + +static void destroy_hist_data(struct hist_trigger_data *hist_data) +{ + destroy_hist_trigger_attrs(hist_data->attrs); + destroy_hist_fields(hist_data); + tracing_map_destroy(hist_data->map); + kfree(hist_data); +} + +static int create_tracing_map_fields(struct hist_trigger_data *hist_data) +{ + struct tracing_map *map = hist_data->map; + struct ftrace_event_field *field; + struct hist_field *hist_field; + unsigned int i, idx; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_KEY) { + tracing_map_cmp_fn_t cmp_fn; + + field = hist_field->field; + + if (is_string_field(field)) + cmp_fn = tracing_map_cmp_string; + else + cmp_fn = tracing_map_cmp_num(field->size, + field->is_signed); + idx = tracing_map_add_key_field(map, 0, cmp_fn); + } else + idx = tracing_map_add_sum_field(map); + + if (idx < 0) + return idx; + } + + return 0; +} + +static struct hist_trigger_data * +create_hist_data(unsigned int map_bits, + struct hist_trigger_attrs *attrs, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + int ret = 0; + + hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + if (!hist_data) + return ERR_PTR(-ENOMEM); + + hist_data->attrs = attrs; + + ret = create_hist_fields(hist_data, file); + if (ret) + goto free; + + ret = create_sort_keys(hist_data); + if (ret) + goto free; + + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, + NULL, hist_data); + if (IS_ERR(hist_data->map)) { + ret = PTR_ERR(hist_data->map); + hist_data->map = NULL; + goto free; + } + + ret = create_tracing_map_fields(hist_data); + if (ret) + goto free; + + ret = tracing_map_init(hist_data->map); + if (ret) + goto free; + + hist_data->event_file = file; + out: + return hist_data; + free: + hist_data->attrs = NULL; + + destroy_hist_data(hist_data); + + hist_data = ERR_PTR(ret); + + goto out; +} + +static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + void *rec) +{ + struct hist_field *hist_field; + unsigned int i; + u64 hist_val; + + for_each_hist_val_field(i, hist_data) { + hist_field = hist_data->fields[i]; + hist_val = hist_field->fn(hist_field, rec); + tracing_map_update_sum(elt, i, hist_val); + } +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + struct tracing_map_elt *elt; + u64 field_contents; + void *key = NULL; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) + key = (void *)(unsigned long)field_contents; + else + key = (void *)&field_contents; + } + + elt = tracing_map_insert(hist_data->map, key); + if (elt) + hist_trigger_elt_update(hist_data, elt, rec); +} + +static void +hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, void *key, + struct tracing_map_elt *elt) +{ + struct hist_field *key_field; + unsigned int i; + u64 uval; + + seq_puts(m, "{ "); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ", "); + + if (key_field->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, "%s: %-50s", key_field->field->name, + (char *)key); + } else { + uval = *(u64 *)key; + seq_printf(m, "%s: %10llu", + key_field->field->name, uval); + } + } + + seq_puts(m, " }"); + + seq_printf(m, " hitcount: %10llu", + tracing_map_read_sum(elt, HITCOUNT_IDX)); + + seq_puts(m, "\n"); +} + +static int print_entries(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + unsigned int i, n_entries; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, + &sort_entries); + if (n_entries < 0) + return n_entries; + + for (i = 0; i < n_entries; i++) + hist_trigger_entry_print(m, hist_data, + sort_entries[i]->key, + sort_entries[i]->elt); + + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *test, *data = NULL; + struct trace_event_file *event_file; + struct hist_trigger_data *hist_data; + int n_entries, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(test, &event_file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + data = test; + break; + } + } + if (!data) + goto out_unlock; + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "\n"); + + hist_data = data->private_data; + n_entries = print_entries(m, hist_data); + if (n_entries < 0) { + ret = n_entries; + n_entries = 0; + } + + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", + (u64)atomic64_read(&hist_data->map->hits), + n_entries, (u64)atomic64_read(&hist_data->map->drops)); + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_open(struct inode *inode, struct file *file) +{ + return single_open(file, hist_show, file); +} + +const struct file_operations event_hist_fops = { + .open = event_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) +{ + seq_printf(m, "%s", hist_field->field->name); +} + +static int event_hist_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + unsigned int i; + + seq_puts(m, "hist:keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ","); + + hist_field_print(m, key_field); + } + + seq_puts(m, ":vals="); + seq_puts(m, "hitcount"); + + seq_puts(m, ":sort="); + seq_puts(m, "hitcount"); + + seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + + if (data->filter_str) + seq_printf(m, " if %s", data->filter_str); + + seq_puts(m, " [active]"); + + seq_putc(m, '\n'); + + return 0; +} + +static void event_hist_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + trigger_data_free(data); + destroy_hist_data(hist_data); + } +} + +static struct event_trigger_ops event_hist_trigger_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_trigger_init, + .free = event_hist_trigger_free, +}; + +static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) +{ + return &event_hist_trigger_ops; +} + +static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + update_cond_flag(file); + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + ret--; + } + out: + return ret; +} + +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; + struct event_trigger_data *trigger_data; + struct hist_trigger_attrs *attrs; + struct event_trigger_ops *trigger_ops; + struct hist_trigger_data *hist_data; + char *trigger; + int ret = 0; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (k:v [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + attrs = parse_hist_trigger_attrs(trigger); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + + if (attrs->map_bits) + hist_trigger_bits = attrs->map_bits; + + hist_data = create_hist_data(hist_trigger_bits, attrs, file); + if (IS_ERR(hist_data)) { + destroy_hist_trigger_attrs(attrs); + return PTR_ERR(hist_data); + } + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out_free; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + trigger_data->private_data = hist_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + ret = 0; + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of triggers registered, + * but if it didn't register any it returns zero. Consider no + * triggers registered a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + /* Just return zero, not the number of registered triggers */ + ret = 0; + out: + return ret; + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + + kfree(trigger_data); + + destroy_hist_data(hist_data); + goto out; +} + +static struct event_command trigger_hist_cmd = { + .name = "hist", + .trigger_type = ETT_EVENT_HIST, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = event_hist_trigger_func, + .reg = hist_register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = event_hist_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +__init int register_trigger_hist_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_cmd); + WARN_ON(ret < 0); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d67992f3bb0e..d29092afe005 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1447,6 +1447,7 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_cmd(); return 0; } -- cgit v1.2.3 From f2606835d70d2a2e6a134f01821da8149e124796 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:43 -0600 Subject: tracing: Add hist trigger support for multiple values ('vals=' param) Allow users to specify trace event fields to use in aggregated sums via a new 'vals=' keyword. Before this addition, the only aggregated sum supported was the implied value 'hitcount'. With this addition, 'hitcount' is also supported as an explicit value field, as is any numeric trace event field. This expands the hist trigger syntax from this: # echo hist:keys=xxx [ if filter] > event/trigger to this: # echo hist:keys=xxx:vals=yyy [ if filter] > event/trigger Link: http://lkml.kernel.org/r/2a5d1adb5ba6c65d7bb2148e379f2fed47f29a68.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 +++--- kernel/trace/trace_events_hist.c | 79 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6cf8fd03b028..bb62f54c5480 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3834,14 +3834,16 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=\n" + "\t [:values=]\n" "\t [:size=#entries]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" - "\t table using the key named, and the value of a sum called\n" - "\t 'hitcount' is incremented. Keys correspond to fields in the\n" - "\t event's format description. Keys can be any field. The\n" - "\t 'size' parameter can be used to specify more or fewer than\n" - "\t the default 2048 entries for the hashtable size.\n\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. Keys and values\n" + "\t correspond to fields in the event's format description. Keys\n" + "\t can be any field. Values must correspond to numeric fields.\n" + "\t The 'size' parameter can be used to specify more or fewer\n" + "\t than the default 2048 entries for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout." #endif diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 23b45e462117..1b33590b9c8f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -84,6 +84,7 @@ enum hist_field_flags { struct hist_trigger_attrs { char *keys_str; + char *vals_str; unsigned int map_bits; }; @@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) return; kfree(attrs->keys_str); + kfree(attrs->vals_str); kfree(attrs); } @@ -183,6 +185,10 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) if ((strncmp(str, "key=", strlen("key=")) == 0) || (strncmp(str, "keys=", strlen("keys=")) == 0)) attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) + attrs->vals_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -276,13 +282,70 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data) return 0; } +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + int ret = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + field = trace_find_event_field(file->event_call, field_str); + if (!field) { + ret = -EINVAL; + goto out; + } + + hist_data->fields[val_idx] = create_hist_field(field, flags); + if (!hist_data->fields[val_idx]) { + ret = -ENOMEM; + goto out; + } + + ++hist_data->n_vals; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + ret = -EINVAL; + out: + return ret; +} + static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { + char *fields_str, *field_str; + unsigned int i, j; int ret; ret = create_hitcount_val(hist_data); + if (ret) + goto out; + fields_str = hist_data->attrs->vals_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX && + j < TRACING_MAP_VALS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + if (strcmp(field_str, "hitcount") == 0) + continue; + ret = create_val_field(hist_data, j++, file, field_str); + if (ret) + goto out; + } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) + ret = -EINVAL; + out: return ret; } @@ -552,6 +615,12 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, " %s: %10llu", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } + seq_puts(m, "\n"); } @@ -659,7 +728,15 @@ static int event_hist_trigger_print(struct seq_file *m, } seq_puts(m, ":vals="); - seq_puts(m, "hitcount"); + + for_each_hist_val_field(i, hist_data) { + if (i == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + seq_puts(m, ","); + hist_field_print(m, hist_data->fields[i]); + } + } seq_puts(m, ":sort="); seq_puts(m, "hitcount"); -- cgit v1.2.3 From 76a3b0c8ac344e1d0f436160cbb59b670b086947 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:44 -0600 Subject: tracing: Add hist trigger support for compound keys Allow users to specify multiple trace event fields to use in keys by allowing multiple fields in the 'keys=' keyword. With this addition, any unique combination of any of the fields named in the 'keys' keyword will result in a new entry being added to the hash table. Link: http://lkml.kernel.org/r/0cfa24e6ac3b0dcece7737d94aa1f322ae3afc4b.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++---- kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62f54c5480..7a4c4dcf0bfe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3833,7 +3833,7 @@ static const char readme_msg[] = "\t Filters can be ignored when removing a trigger.\n" #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" - "\t Format: hist:keys=\n" + "\t Format: hist:keys=\n" "\t [:values=]\n" "\t [:size=#entries]\n" "\t [if ]\n\n" @@ -3841,9 +3841,11 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field. Values must correspond to numeric fields.\n" - "\t The 'size' parameter can be used to specify more or fewer\n" - "\t than the default 2048 entries for the hashtable size.\n\n" + "\t can be any field. Compound keys consisting of up to two\n" + "\t fields can be specified by the 'keys' keyword. Values must\n" + "\t correspond to numeric fields. The 'size' parameter can be\n" + "\t used to specify more or fewer than the default 2048 entries\n" + "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout." #endif diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b33590b9c8f..65fdfc6cb633 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -32,6 +32,7 @@ struct hist_field { unsigned long flags; hist_field_fn_t fn; unsigned int size; + unsigned int offset; }; static u64 hist_field_counter(struct hist_field *field, void *event) @@ -73,8 +74,7 @@ DEFINE_HIST_FIELD_FN(u8); for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) #define HITCOUNT_IDX 0 -#define HIST_KEY_MAX 1 -#define HIST_KEY_SIZE_MAX MAX_FILTER_STR_VAL +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + sizeof(u64)) enum hist_field_flags { HIST_FIELD_FL_HITCOUNT = 1, @@ -351,6 +351,7 @@ static int create_val_fields(struct hist_trigger_data *hist_data, static int create_key_field(struct hist_trigger_data *hist_data, unsigned int key_idx, + unsigned int key_offset, struct trace_event_file *file, char *field_str) { @@ -380,7 +381,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, key_size = ALIGN(key_size, sizeof(u64)); hist_data->fields[key_idx]->size = key_size; - hist_data->key_size = key_size; + hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; if (hist_data->key_size > HIST_KEY_SIZE_MAX) { ret = -EINVAL; goto out; @@ -399,7 +401,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, static int create_key_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { - unsigned int i, n_vals = hist_data->n_vals; + unsigned int i, key_offset = 0, n_vals = hist_data->n_vals; char *fields_str, *field_str; int ret = -EINVAL; @@ -411,13 +413,15 @@ static int create_key_fields(struct hist_trigger_data *hist_data, if (!fields_str) goto out; - for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) { + for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) { field_str = strsep(&fields_str, ","); if (!field_str) break; - ret = create_key_field(hist_data, i, file, field_str); + ret = create_key_field(hist_data, i, key_offset, + file, field_str); if (ret < 0) goto out; + key_offset += ret; } if (fields_str) { ret = -EINVAL; @@ -482,7 +486,10 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) else cmp_fn = tracing_map_cmp_num(field->size, field->is_signed); - idx = tracing_map_add_key_field(map, 0, cmp_fn); + idx = tracing_map_add_key_field(map, + hist_field->offset, + cmp_fn); + } else idx = tracing_map_add_sum_field(map); @@ -562,12 +569,16 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, static void event_hist_trigger(struct event_trigger_data *data, void *rec) { struct hist_trigger_data *hist_data = data->private_data; + char compound_key[HIST_KEY_SIZE_MAX]; struct hist_field *key_field; struct tracing_map_elt *elt; u64 field_contents; void *key = NULL; unsigned int i; + if (hist_data->n_keys > 1) + memset(compound_key, 0, hist_data->key_size); + for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -576,8 +587,16 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = (void *)(unsigned long)field_contents; else key = (void *)&field_contents; + + if (hist_data->n_keys > 1) { + memcpy(compound_key + key_field->offset, key, + key_field->size); + } } + if (hist_data->n_keys > 1) + key = compound_key; + elt = tracing_map_insert(hist_data->map, key); if (elt) hist_trigger_elt_update(hist_data, elt, rec); @@ -602,11 +621,11 @@ hist_trigger_entry_print(struct seq_file *m, if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, - (char *)key); + (char *)(key + key_field->offset)); } else { - uval = *(u64 *)key; - seq_printf(m, "%s: %10llu", - key_field->field->name, uval); + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %10llu", key_field->field->name, + uval); } } -- cgit v1.2.3 From e62347d2453474fa514fbbbc4636313d34d3c850 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:45 -0600 Subject: tracing: Add hist trigger support for user-defined sorting ('sort=' param) Allow users to specify keys and/or values to sort on. With this addition, keys and values specified using the 'keys=' and 'vals=' keywords can be used to sort the hist trigger output via a new 'sort=' keyword. If multiple sort keys are specified, the output will be sorted using the second key as a secondary sort key, etc. The default sort order is ascending; if the user wants a different sort order, '.descending' can be appended to the specific sort key. Before this addition, output was always sorted by 'hitcount' in ascending order. This expands the hist trigger syntax from this: # echo hist:keys=xxx:vals=yyy \ [ if filter] > event/trigger to this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/b30a41db66ba486979c4f987aff5fab500ea53b3.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++- kernel/trace/trace_events_hist.c | 112 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7a4c4dcf0bfe..d16fa8e1cdbf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3835,6 +3835,7 @@ static const char readme_msg[] = " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=\n" "\t [:values=]\n" + "\t [:sort=]\n" "\t [:size=#entries]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" @@ -3843,7 +3844,10 @@ static const char readme_msg[] = "\t correspond to fields in the event's format description. Keys\n" "\t can be any field. Compound keys consisting of up to two\n" "\t fields can be specified by the 'keys' keyword. Values must\n" - "\t correspond to numeric fields. The 'size' parameter can be\n" + "\t correspond to numeric fields. Sort keys consisting of up to\n" + "\t two fields can be specified using the 'sort' keyword. The\n" + "\t sort direction can be modified by appending '.descending' or\n" + "\t '.ascending' to a sort field. The 'size' parameter can be\n" "\t used to specify more or fewer than the default 2048 entries\n" "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65fdfc6cb633..03ba4530c08c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -85,6 +85,7 @@ enum hist_field_flags { struct hist_trigger_attrs { char *keys_str; char *vals_str; + char *sort_key_str; unsigned int map_bits; }; @@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) if (!attrs) return; + kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); kfree(attrs); @@ -189,6 +191,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) (strncmp(str, "vals=", strlen("vals=")) == 0) || (strncmp(str, "values=", strlen("values=")) == 0)) attrs->vals_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "sort=", strlen("sort=")) == 0) + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -450,12 +454,92 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, return ret; } +static int is_descending(const char *str) +{ + if (!str) + return 0; + + if (strcmp(str, "descending") == 0) + return 1; + + if (strcmp(str, "ascending") == 0) + return 0; + + return -EINVAL; +} + static int create_sort_keys(struct hist_trigger_data *hist_data) { - int ret = 0; + char *fields_str = hist_data->attrs->sort_key_str; + struct ftrace_event_field *field = NULL; + struct tracing_map_sort_key *sort_key; + int descending, ret = 0; + unsigned int i, j; + + hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ + + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + char *field_str, *field_name; + + sort_key = &hist_data->sort_keys[i]; + + field_str = strsep(&fields_str, ","); + if (!field_str) { + if (i == 0) + ret = -EINVAL; + break; + } + + if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) { + ret = -EINVAL; + break; + } - hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */ + field_name = strsep(&field_str, "."); + if (!field_name) { + ret = -EINVAL; + break; + } + + if (strcmp(field_name, "hitcount") == 0) { + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + break; + } + sort_key->descending = descending; + continue; + } + for (j = 1; j < hist_data->n_fields; j++) { + field = hist_data->fields[j]->field; + if (field && (strcmp(field_name, field->name) == 0)) { + sort_key->field_idx = j; + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + goto out; + } + sort_key->descending = descending; + break; + } + } + if (j == hist_data->n_fields) { + ret = -EINVAL; + break; + } + } + hist_data->n_sort_keys = i; + out: return ret; } @@ -758,7 +842,29 @@ static int event_hist_trigger_print(struct seq_file *m, } seq_puts(m, ":sort="); - seq_puts(m, "hitcount"); + + for (i = 0; i < hist_data->n_sort_keys; i++) { + struct tracing_map_sort_key *sort_key; + + sort_key = &hist_data->sort_keys[i]; + + if (i > 0) + seq_puts(m, ","); + + if (sort_key->field_idx == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + unsigned int idx = sort_key->field_idx; + + if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + hist_field_print(m, hist_data->fields[idx]); + } + + if (sort_key->descending) + seq_puts(m, ".descending"); + } seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); -- cgit v1.2.3 From 83e99914c9e2677d8a80f2a23eca0d215d5bfb0f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:46 -0600 Subject: tracing: Add hist trigger support for pausing and continuing a trace Allow users to append 'pause' or 'continue' to an existing trigger in order to have it paused or to have a paused trace continue. This expands the hist trigger syntax from this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \ [ if filter] >> event/trigger to this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause or cont \ [ if filter] >> event/trigger Link: http://lkml.kernel.org/r/b672a92c14702cb924cdf6fc27ea1809bed04907.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 ++++++- kernel/trace/trace_events_hist.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d16fa8e1cdbf..7d5c63dd410a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3837,6 +3837,7 @@ static const char readme_msg[] = "\t [:values=]\n" "\t [:sort=]\n" "\t [:size=#entries]\n" + "\t [:pause][:continue]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -3851,7 +3852,11 @@ static const char readme_msg[] = "\t used to specify more or fewer than the default 2048 entries\n" "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" - "\t table in its entirety to stdout." + "\t table in its entirety to stdout.\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 'continue' can be used to start or\n" + "\t restart a paused hist trigger.\n\n" #endif ; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 03ba4530c08c..e3fa9c89f125 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -86,6 +86,8 @@ struct hist_trigger_attrs { char *keys_str; char *vals_str; char *sort_key_str; + bool pause; + bool cont; unsigned int map_bits; }; @@ -193,6 +195,11 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) attrs->vals_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "sort=", strlen("sort=")) == 0) attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + else if (strcmp(str, "pause") == 0) + attrs->pause = true; + else if ((strcmp(str, "cont") == 0) || + (strcmp(str, "continue") == 0)) + attrs->cont = true; else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -871,7 +878,10 @@ static int event_hist_trigger_print(struct seq_file *m, if (data->filter_str) seq_printf(m, " if %s", data->filter_str); - seq_puts(m, " [active]"); + if (data->paused) + seq_puts(m, " [paused]"); + else + seq_puts(m, " [active]"); seq_putc(m, '\n'); @@ -910,16 +920,30 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { + struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test; int ret = 0; list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - ret = -EEXIST; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else + ret = -EEXIST; goto out; } } + if (hist_data->attrs->cont) { + ret = -ENOENT; + goto out; + } + + if (hist_data->attrs->pause) + data->paused = true; + if (data->ops->init) { ret = data->ops->init(data->ops, data); if (ret < 0) @@ -1011,7 +1035,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, * triggers registered a failure too. */ if (!ret) { - ret = -ENOENT; + if (!(attrs->pause || attrs->cont)) + ret = -ENOENT; goto out_free; } else if (ret < 0) goto out_free; -- cgit v1.2.3 From e86ae9baacfa9efe957df4fc037f079772636d76 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:47 -0600 Subject: tracing: Add hist trigger support for clearing a trace Allow users to append 'clear' to an existing trigger in order to have the hash table cleared. This expands the hist trigger syntax from this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont \ [ if filter] >> event/trigger to this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont/clear \ [ if filter] >> event/trigger Link: http://lkml.kernel.org/r/ae15dd0d9b2f7af07a37c1ff682063e2dbcdf160.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- kernel/trace/trace_events_hist.c | 24 ++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d5c63dd410a..4097cd3763f7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3837,7 +3837,7 @@ static const char readme_msg[] = "\t [:values=]\n" "\t [:sort=]\n" "\t [:size=#entries]\n" - "\t [:pause][:continue]\n" + "\t [:pause][:continue][:clear]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -3857,6 +3857,9 @@ static const char readme_msg[] = "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" "\t restart a paused hist trigger.\n\n" + "\t The 'clear' parameter will clear the contents of a running\n" + "\t hist trigger and leave its current paused/active state\n" + "\t unchanged.\n\n" #endif ; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e3fa9c89f125..83cb25e067ad 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -88,6 +88,7 @@ struct hist_trigger_attrs { char *sort_key_str; bool pause; bool cont; + bool clear; unsigned int map_bits; }; @@ -200,6 +201,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) attrs->cont = true; + else if (strcmp(str, "clear") == 0) + attrs->clear = true; else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -916,6 +919,21 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, return &event_hist_trigger_ops; } +static void hist_clear(struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + bool paused; + + paused = data->paused; + data->paused = true; + + synchronize_sched(); + + tracing_map_clear(hist_data->map); + + data->paused = paused; +} + static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -930,13 +948,15 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, test->paused = true; else if (hist_data->attrs->cont) test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); else ret = -EEXIST; goto out; } } - if (hist_data->attrs->cont) { + if (hist_data->attrs->cont || hist_data->attrs->clear) { ret = -ENOENT; goto out; } @@ -1035,7 +1055,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, * triggers registered a failure too. */ if (!ret) { - if (!(attrs->pause || attrs->cont)) + if (!(attrs->pause || attrs->cont || attrs->clear)) ret = -ENOENT; goto out_free; } else if (ret < 0) -- cgit v1.2.3 From 0c4a6b4666e8eb86dead3f09b40bb8ca4f614e4f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:48 -0600 Subject: tracing: Add hist trigger 'hex' modifier for displaying numeric fields Allow users to have numeric fields displayed as hex values in the output by appending '.hex' to field names: # echo hist:keys=aaa,bbb.hex:vals=ccc.hex ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/67bd431edda2af5798d7694818f7e8d71b6b3463.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +++- kernel/trace/trace_events_hist.c | 62 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4097cd3763f7..2eb2eafbe7f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3852,7 +3852,10 @@ static const char readme_msg[] = "\t used to specify more or fewer than the default 2048 entries\n" "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" - "\t table in its entirety to stdout.\n\n" + "\t table in its entirety to stdout. The default format used to\n" + "\t display a given field can be modified by appending any of the\n" + "\t following modifiers to the field name, as applicable:\n\n" + "\t .hex display a number as a hex value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 83cb25e067ad..8e49eeef8867 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -80,6 +80,7 @@ enum hist_field_flags { HIST_FIELD_FL_HITCOUNT = 1, HIST_FIELD_FL_KEY = 2, HIST_FIELD_FL_STRING = 4, + HIST_FIELD_FL_HEX = 8, }; struct hist_trigger_attrs { @@ -303,11 +304,23 @@ static int create_val_field(struct hist_trigger_data *hist_data, { struct ftrace_event_field *field = NULL; unsigned long flags = 0; + char *field_name; int ret = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) return -EINVAL; - field = trace_find_event_field(file->event_call, field_str); + + field_name = strsep(&field_str, "."); + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); if (!field) { ret = -EINVAL; goto out; @@ -372,6 +385,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct ftrace_event_field *field = NULL; unsigned long flags = 0; unsigned int key_size; + char *field_name; int ret = 0; if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) @@ -379,7 +393,17 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_KEY; - field = trace_find_event_field(file->event_call, field_str); + field_name = strsep(&field_str, "."); + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); if (!field) { ret = -EINVAL; goto out; @@ -713,7 +737,11 @@ hist_trigger_entry_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ", "); - if (key_field->flags & HIST_FIELD_FL_STRING) { + if (key_field->flags & HIST_FIELD_FL_HEX) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %llx", + key_field->field->name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); } else { @@ -729,9 +757,15 @@ hist_trigger_entry_print(struct seq_file *m, tracing_map_read_sum(elt, HITCOUNT_IDX)); for (i = 1; i < hist_data->n_vals; i++) { - seq_printf(m, " %s: %10llu", - hist_data->fields[i]->field->name, - tracing_map_read_sum(elt, i)); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } else { + seq_printf(m, " %s: %10llu", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } } seq_puts(m, "\n"); @@ -816,9 +850,25 @@ const struct file_operations event_hist_fops = { .release = single_release, }; +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + + return flags_str; +} + static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { seq_printf(m, "%s", hist_field->field->name); + if (hist_field->flags) { + const char *flags_str = get_hist_field_flags(hist_field); + + if (flags_str) + seq_printf(m, ".%s", flags_str); + } } static int event_hist_trigger_print(struct seq_file *m, -- cgit v1.2.3 From c6afad49d127f6d7c9957319f55173a2198b1ba8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:49 -0600 Subject: tracing: Add hist trigger 'sym' and 'sym-offset' modifiers Allow users to have address fields displayed as symbols in the output by appending '.sym' or 'sym-offset' to field names: # echo hist:keys=aaa.sym,bbb.sym-offset ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/87d4935821491c0275513f0fbfb9bab8d3d3f079.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- kernel/trace/trace_events_hist.c | 29 +++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2eb2eafbe7f5..34459eb0c844 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3855,7 +3855,9 @@ static const char readme_msg[] = "\t table in its entirety to stdout. The default format used to\n" "\t display a given field can be modified by appending any of the\n" "\t following modifiers to the field name, as applicable:\n\n" - "\t .hex display a number as a hex value\n\n" + "\t .hex display a number as a hex value\n" + "\t .sym display an address as a symbol\n" + "\t .sym-offset display an address as a symbol and offset\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e49eeef8867..808cc06cdb61 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -77,10 +77,12 @@ DEFINE_HIST_FIELD_FN(u8); #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + sizeof(u64)) enum hist_field_flags { - HIST_FIELD_FL_HITCOUNT = 1, - HIST_FIELD_FL_KEY = 2, - HIST_FIELD_FL_STRING = 4, - HIST_FIELD_FL_HEX = 8, + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, + HIST_FIELD_FL_HEX = 8, + HIST_FIELD_FL_SYM = 16, + HIST_FIELD_FL_SYM_OFFSET = 32, }; struct hist_trigger_attrs { @@ -397,6 +399,10 @@ static int create_key_field(struct hist_trigger_data *hist_data, if (field_str) { if (strcmp(field_str, "hex") == 0) flags |= HIST_FIELD_FL_HEX; + else if (strcmp(field_str, "sym") == 0) + flags |= HIST_FIELD_FL_SYM; + else if (strcmp(field_str, "sym-offset") == 0) + flags |= HIST_FIELD_FL_SYM_OFFSET; else { ret = -EINVAL; goto out; @@ -726,6 +732,7 @@ hist_trigger_entry_print(struct seq_file *m, struct tracing_map_elt *elt) { struct hist_field *key_field; + char str[KSYM_SYMBOL_LEN]; unsigned int i; u64 uval; @@ -741,6 +748,16 @@ hist_trigger_entry_print(struct seq_file *m, uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %llx", key_field->field->name, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYM) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol_no_offset(str, uval); + seq_printf(m, "%s: [%llx] %-45s", + key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol(str, uval); + seq_printf(m, "%s: [%llx] %-55s", + key_field->field->name, uval, str); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -856,6 +873,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_HEX) flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; return flags_str; } -- cgit v1.2.3 From 6b4827ad028a1ab2fc4dcf1f5e6e077018d1b770 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:50 -0600 Subject: tracing: Add hist trigger 'execname' modifier Allow users to have common_pid field values displayed as program names in the output by appending '.execname' to a common_pid field name: # echo hist:keys=common_pid.execname ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/e172e81f10f5b8d1f08450e3763c850f39fbf698.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 100 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 34459eb0c844..58a8bee50c6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3857,7 +3857,8 @@ static const char readme_msg[] = "\t following modifiers to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" - "\t .sym-offset display an address as a symbol and offset\n\n" + "\t .sym-offset display an address as a symbol and offset\n" + "\t .execname display a common_pid as a program name\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 808cc06cdb61..227e6c21b1eb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -83,6 +83,7 @@ enum hist_field_flags { HIST_FIELD_FL_HEX = 8, HIST_FIELD_FL_SYM = 16, HIST_FIELD_FL_SYM_OFFSET = 32, + HIST_FIELD_FL_EXECNAME = 64, }; struct hist_trigger_attrs { @@ -232,6 +233,73 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) return ERR_PTR(ret); } +static inline void save_comm(char *comm, struct task_struct *task) +{ + if (!task->pid) { + strcpy(comm, ""); + return; + } + + if (WARN_ON_ONCE(task->pid < 0)) { + strcpy(comm, ""); + return; + } + + memcpy(comm, task->comm, TASK_COMM_LEN); +} + +static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +{ + kfree((char *)elt->private_data); +} + +static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +{ + struct hist_trigger_data *hist_data = elt->map->private_data; + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + unsigned int size = TASK_COMM_LEN + 1; + + elt->private_data = kzalloc(size, GFP_KERNEL); + if (!elt->private_data) + return -ENOMEM; + break; + } + } + + return 0; +} + +static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, + struct tracing_map_elt *from) +{ + char *comm_from = from->private_data; + char *comm_to = to->private_data; + + if (comm_from) + memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); +} + +static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +{ + char *comm = elt->private_data; + + if (comm) + save_comm(comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_comm_ops = { + .elt_alloc = hist_trigger_elt_comm_alloc, + .elt_copy = hist_trigger_elt_comm_copy, + .elt_free = hist_trigger_elt_comm_free, + .elt_init = hist_trigger_elt_comm_init, +}; + static void destroy_hist_field(struct hist_field *hist_field) { kfree(hist_field); @@ -403,6 +471,9 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_SYM; else if (strcmp(field_str, "sym-offset") == 0) flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(field_str, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + flags |= HIST_FIELD_FL_EXECNAME; else { ret = -EINVAL; goto out; @@ -624,11 +695,27 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) return 0; } +static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) +{ + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) + return true; + } + + return false; +} + static struct hist_trigger_data * create_hist_data(unsigned int map_bits, struct hist_trigger_attrs *attrs, struct trace_event_file *file) { + const struct tracing_map_ops *map_ops = NULL; struct hist_trigger_data *hist_data; int ret = 0; @@ -646,8 +733,11 @@ create_hist_data(unsigned int map_bits, if (ret) goto free; + if (need_tracing_map_ops(hist_data)) + map_ops = &hist_trigger_elt_comm_ops; + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, - NULL, hist_data); + map_ops, hist_data); if (IS_ERR(hist_data->map)) { ret = PTR_ERR(hist_data->map); hist_data->map = NULL; @@ -758,6 +848,12 @@ hist_trigger_entry_print(struct seq_file *m, sprint_symbol(str, uval); seq_printf(m, "%s: [%llx] %-55s", key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + char *comm = elt->private_data; + + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %-16s[%10llu]", + key_field->field->name, comm, uval); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -877,6 +973,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "sym"; else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; return flags_str; } -- cgit v1.2.3 From 316961988b5ec71bbf4b2ad447662770349aec13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:51 -0600 Subject: tracing: Add hist trigger 'syscall' modifier Allow users to have syscall id fields displayed as syscall names in the output by appending '.syscall' to field names: # echo hist:keys=aaa.syscall ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/2bab1e59933d76a14b545bd2e02f80b8b08ac4d3.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- kernel/trace/trace_events_hist.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 58a8bee50c6e..0dd23c5b1c34 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3858,7 +3858,8 @@ static const char readme_msg[] = "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" - "\t .execname display a common_pid as a program name\n\n" + "\t .execname display a common_pid as a program name\n" + "\t .syscall display a syscall id as a syscall name\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 227e6c21b1eb..f9e7ecb33847 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -84,6 +84,7 @@ enum hist_field_flags { HIST_FIELD_FL_SYM = 16, HIST_FIELD_FL_SYM_OFFSET = 32, HIST_FIELD_FL_EXECNAME = 64, + HIST_FIELD_FL_SYSCALL = 128, }; struct hist_trigger_attrs { @@ -474,6 +475,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, else if ((strcmp(field_str, "execname") == 0) && (strcmp(field_name, "common_pid") == 0)) flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(field_str, "syscall") == 0) + flags |= HIST_FIELD_FL_SYSCALL; else { ret = -EINVAL; goto out; @@ -854,6 +857,16 @@ hist_trigger_entry_print(struct seq_file *m, uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %-16s[%10llu]", key_field->field->name, comm, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { + const char *syscall_name; + + uval = *(u64 *)(key + key_field->offset); + syscall_name = get_syscall_name(uval); + if (!syscall_name) + syscall_name = "unknown_syscall"; + + seq_printf(m, "%s: %-30s[%3llu]", + key_field->field->name, syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -975,6 +988,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "sym-offset"; else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; return flags_str; } -- cgit v1.2.3 From 69a0200c2e25d61c50091549d00cfeb426c258f5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:52 -0600 Subject: tracing: Add hist trigger support for stacktraces as keys It's often useful to be able to use a stacktrace as a hash key, for keeping a count of the number of times a particular call path resulted in a trace event, for instance. Add a special key named 'stacktrace' which can be used as key in a 'keys=' param for this purpose: # echo hist:keys=stacktrace ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/87515e90b3785232a874a12156174635a348edb1.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 ++--- kernel/trace/trace_events_hist.c | 135 +++++++++++++++++++++++++++++---------- 2 files changed, 109 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0dd23c5b1c34..2238bfde799b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3843,14 +3843,14 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field. Compound keys consisting of up to two\n" - "\t fields can be specified by the 'keys' keyword. Values must\n" - "\t correspond to numeric fields. Sort keys consisting of up to\n" - "\t two fields can be specified using the 'sort' keyword. The\n" - "\t sort direction can be modified by appending '.descending' or\n" - "\t '.ascending' to a sort field. The 'size' parameter can be\n" - "\t used to specify more or fewer than the default 2048 entries\n" - "\t for the hashtable size.\n\n" + "\t can be any field, or the special string 'stacktrace'.\n" + "\t Compound keys consisting of up to two fields can be specified\n" + "\t by the 'keys' keyword. Values must correspond to numeric\n" + "\t fields. Sort keys consisting of up to two fields can be\n" + "\t specified using the 'sort' keyword. The sort direction can\n" + "\t be modified by appending '.descending' or '.ascending' to a\n" + "\t sort field. The 'size' parameter can be used to specify more\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. The default format used to\n" "\t display a given field can be modified by appending any of the\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f9e7ecb33847..21cae42b54da 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -35,6 +35,11 @@ struct hist_field { unsigned int offset; }; +static u64 hist_field_none(struct hist_field *field, void *event) +{ + return 0; +} + static u64 hist_field_counter(struct hist_field *field, void *event) { return 1; @@ -73,8 +78,12 @@ DEFINE_HIST_FIELD_FN(u8); #define for_each_hist_key_field(i, hist_data) \ for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + #define HITCOUNT_IDX 0 -#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + sizeof(u64)) +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) enum hist_field_flags { HIST_FIELD_FL_HITCOUNT = 1, @@ -85,6 +94,7 @@ enum hist_field_flags { HIST_FIELD_FL_SYM_OFFSET = 32, HIST_FIELD_FL_EXECNAME = 64, HIST_FIELD_FL_SYSCALL = 128, + HIST_FIELD_FL_STACKTRACE = 256, }; struct hist_trigger_attrs { @@ -323,6 +333,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (flags & HIST_FIELD_FL_STACKTRACE) { + hist_field->fn = hist_field_none; + goto out; + } + if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; hist_field->fn = hist_field_string; @@ -456,7 +471,6 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct ftrace_event_field *field = NULL; unsigned long flags = 0; unsigned int key_size; - char *field_name; int ret = 0; if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) @@ -464,33 +478,39 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_KEY; - field_name = strsep(&field_str, "."); - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else if (strcmp(field_str, "sym") == 0) - flags |= HIST_FIELD_FL_SYM; - else if (strcmp(field_str, "sym-offset") == 0) - flags |= HIST_FIELD_FL_SYM_OFFSET; - else if ((strcmp(field_str, "execname") == 0) && - (strcmp(field_name, "common_pid") == 0)) - flags |= HIST_FIELD_FL_EXECNAME; - else if (strcmp(field_str, "syscall") == 0) - flags |= HIST_FIELD_FL_SYSCALL; - else { + if (strcmp(field_str, "stacktrace") == 0) { + flags |= HIST_FIELD_FL_STACKTRACE; + key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + } else { + char *field_name = strsep(&field_str, "."); + + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else if (strcmp(field_str, "sym") == 0) + flags |= HIST_FIELD_FL_SYM; + else if (strcmp(field_str, "sym-offset") == 0) + flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(field_str, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(field_str, "syscall") == 0) + flags |= HIST_FIELD_FL_SYSCALL; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); + if (!field) { ret = -EINVAL; goto out; } - } - field = trace_find_event_field(file->event_call, field_name); - if (!field) { - ret = -EINVAL; - goto out; + key_size = field->size; } - key_size = field->size; - hist_data->fields[key_idx] = create_hist_field(field, flags); if (!hist_data->fields[key_idx]) { ret = -ENOMEM; @@ -679,7 +699,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) field = hist_field->field; - if (is_string_field(field)) + if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + cmp_fn = tracing_map_cmp_none; + else if (is_string_field(field)) cmp_fn = tracing_map_cmp_string; else cmp_fn = tracing_map_cmp_num(field->size, @@ -786,7 +808,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, static void event_hist_trigger(struct event_trigger_data *data, void *rec) { struct hist_trigger_data *hist_data = data->private_data; + unsigned long entries[HIST_STACKTRACE_DEPTH]; char compound_key[HIST_KEY_SIZE_MAX]; + struct stack_trace stacktrace; struct hist_field *key_field; struct tracing_map_elt *elt; u64 field_contents; @@ -799,15 +823,27 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; - field_contents = key_field->fn(key_field, rec); - if (key_field->flags & HIST_FIELD_FL_STRING) - key = (void *)(unsigned long)field_contents; - else - key = (void *)&field_contents; + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + stacktrace.max_entries = HIST_STACKTRACE_DEPTH; + stacktrace.entries = entries; + stacktrace.nr_entries = 0; + stacktrace.skip = HIST_STACKTRACE_SKIP; - if (hist_data->n_keys > 1) { - memcpy(compound_key + key_field->offset, key, - key_field->size); + memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); + save_stack_trace(&stacktrace); + + key = entries; + } else { + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) + key = (void *)(unsigned long)field_contents; + else + key = (void *)&field_contents; + + if (hist_data->n_keys > 1) { + memcpy(compound_key + key_field->offset, key, + key_field->size); + } } } @@ -819,6 +855,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) hist_trigger_elt_update(hist_data, elt, rec); } +static void hist_trigger_stacktrace_print(struct seq_file *m, + unsigned long *stacktrace_entries, + unsigned int max_entries) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned int spaces = 8; + unsigned int i; + + for (i = 0; i < max_entries; i++) { + if (stacktrace_entries[i] == ULONG_MAX) + return; + + seq_printf(m, "%*c", 1 + spaces, ' '); + sprint_symbol(str, stacktrace_entries[i]); + seq_printf(m, "%s\n", str); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, void *key, @@ -826,6 +880,7 @@ hist_trigger_entry_print(struct seq_file *m, { struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; + bool multiline = false; unsigned int i; u64 uval; @@ -867,6 +922,12 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, "%s: %-30s[%3llu]", key_field->field->name, syscall_name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + seq_puts(m, "stacktrace:\n"); + hist_trigger_stacktrace_print(m, + key + key_field->offset, + HIST_STACKTRACE_DEPTH); + multiline = true; } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -877,7 +938,10 @@ hist_trigger_entry_print(struct seq_file *m, } } - seq_puts(m, " }"); + if (!multiline) + seq_puts(m, " "); + + seq_puts(m, "}"); seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); @@ -1021,7 +1085,10 @@ static int event_hist_trigger_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ","); - hist_field_print(m, key_field); + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) + seq_puts(m, "stacktrace"); + else + hist_field_print(m, key_field); } seq_puts(m, ":vals="); -- cgit v1.2.3 From 79e577cbce4c4c2bf0d64ec315adb04eda40736b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 3 Mar 2016 12:54:53 -0600 Subject: tracing: Support string type key properly The string in a trace event is usually recorded as dynamic array which is variable length. But current hist code only support fixed length array so it cannot support most strings. This patch fixes it by checking filter_type of the field and get proper pointer with it. With this, it can get a histogram of exec() based on filenames like below: # cd /sys/kernel/tracing/events/sched/sched_process_exec # cat 'hist:key=filename' > trigger # ps PID TTY TIME CMD 1 ? 00:00:00 init 29 ? 00:00:00 sh 38 ? 00:00:00 ps # ls enable filter format hist id trigger # cat hist # trigger info: hist:keys=filename:vals=hitcount:sort=hitcount:size=2048 [active] { filename: /usr/bin/ps } hitcount: 1 { filename: /usr/bin/ls } hitcount: 1 { filename: /usr/bin/cat } hitcount: 1 Totals: Hits: 3 Entries: 3 Dropped: 0 Link: http://lkml.kernel.org/r/610180d6df0cfdf11ee205452f3b241dea657233.1457029949.git.tom.zanussi@linux.intel.com Cc: Tom Zanussi Cc: Masami Hiramatsu Signed-off-by: Namhyung Kim Tested-by: Masami Hiramatsu [ Added (unsigned long) typecast to fix compile warning ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 49 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21cae42b54da..418ff27cbbaf 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -52,12 +52,28 @@ static u64 hist_field_string(struct hist_field *hist_field, void *event) return (u64)(unsigned long)addr; } +static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_pstring(struct hist_field *hist_field, void *event) +{ + char **addr = (char **)(event + hist_field->field->offset); + + return (u64)(unsigned long)*addr; +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ - return (u64)*addr; \ + return (u64)(unsigned long)*addr; \ } DEFINE_HIST_FIELD_FN(s64); @@ -340,7 +356,13 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; - hist_field->fn = hist_field_string; + + if (field->filter_type == FILTER_STATIC_STRING) + hist_field->fn = hist_field_string; + else if (field->filter_type == FILTER_DYN_STRING) + hist_field->fn = hist_field_dynstring; + else + hist_field->fn = hist_field_pstring; } else { hist_field->fn = select_value_fn(field->size, field->is_signed); @@ -508,7 +530,10 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - key_size = field->size; + if (is_string_field(field)) /* should be last key field */ + key_size = HIST_KEY_SIZE_MAX - key_offset; + else + key_size = field->size; } hist_data->fields[key_idx] = create_hist_field(field, flags); @@ -841,8 +866,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = (void *)&field_contents; if (hist_data->n_keys > 1) { + /* ensure NULL-termination */ + size_t size = key_field->size - 1; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_PTR_STRING) + size = strlen(key); + + if (size > key_field->size - 1) + size = key_field->size - 1; + } + memcpy(compound_key + key_field->offset, key, - key_field->size); + size); } } } -- cgit v1.2.3 From 6a475cb17fe161dbde62eca7e4c2cf59edff182f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:54 -0600 Subject: tracing: Remove restriction on string position in hist trigger keys If we assume the maximum size for a string field, we don't have to worry about its position. Since we only allow two keys in a compound key and having more than one string key in a given compound key doesn't make much sense anyway, trading a bit of extra space instead of introducing an arbitrary restriction makes more sense. We also need to use the event field size for static strings when copying the contents, otherwise we get random garbage in the key. Also, cast string return values to avoid warnings on 32-bit compiles. Finally, rearrange the code without changing any functionality by moving the compound key updating code into a separate function. Link: http://lkml.kernel.org/r/8976e1ab04b66bc2700ad1ed0768a2de85ac1983.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 63 ++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 418ff27cbbaf..4f4041d76926 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -530,8 +530,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - if (is_string_field(field)) /* should be last key field */ - key_size = HIST_KEY_SIZE_MAX - key_offset; + if (is_string_field(field)) + key_size = MAX_FILTER_STR_VAL; else key_size = field->size; } @@ -830,9 +830,34 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, } } +static inline void add_to_key(char *compound_key, void *key, + struct hist_field *key_field, void *rec) +{ + size_t size = key_field->size; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_PTR_STRING) + size = strlen(key); + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + + /* ensure NULL-termination */ + if (size > key_field->size - 1) + size = key_field->size - 1; + } + + memcpy(compound_key + key_field->offset, key, size); +} + static void event_hist_trigger(struct event_trigger_data *data, void *rec) { struct hist_trigger_data *hist_data = data->private_data; + bool use_compound_key = (hist_data->n_keys > 1); unsigned long entries[HIST_STACKTRACE_DEPTH]; char compound_key[HIST_KEY_SIZE_MAX]; struct stack_trace stacktrace; @@ -842,8 +867,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) void *key = NULL; unsigned int i; - if (hist_data->n_keys > 1) - memset(compound_key, 0, hist_data->key_size); + memset(compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -860,35 +884,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = entries; } else { field_contents = key_field->fn(key_field, rec); - if (key_field->flags & HIST_FIELD_FL_STRING) + if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; - else + use_compound_key = true; + } else key = (void *)&field_contents; - - if (hist_data->n_keys > 1) { - /* ensure NULL-termination */ - size_t size = key_field->size - 1; - - if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_PTR_STRING) - size = strlen(key); - - if (size > key_field->size - 1) - size = key_field->size - 1; - } - - memcpy(compound_key + key_field->offset, key, - size); - } } + + if (use_compound_key) + add_to_key(compound_key, key, key_field, rec); } - if (hist_data->n_keys > 1) + if (use_compound_key) key = compound_key; elt = tracing_map_insert(hist_data->map, key); -- cgit v1.2.3 From d0bad49bb0a094a1beb06640785f95cb256b7272 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:55 -0600 Subject: tracing: Add enable_hist/disable_hist triggers Similar to enable_event/disable_event triggers, these triggers enable and disable the aggregation of events into maps rather than enabling and disabling their writing into the trace buffer. They can be used to automatically start and stop hist triggers based on a matching filter condition. If there's a paused hist trigger on system:event, the following would start it when the filter condition was hit: # echo enable_hist:system:event [ if filter] > event/trigger And the following would disable a running system:event hist trigger: # echo disable_hist:system:event [ if filter] > event/trigger See Documentation/trace/events.txt for real examples. Link: http://lkml.kernel.org/r/f812f086e52c8b7c8ad5443487375e03c96a601f.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++ kernel/trace/trace.h | 32 ++++++++++ kernel/trace/trace_events_hist.c | 115 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 71 ++++++++++++---------- 4 files changed, 195 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2238bfde799b..8430145bea12 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3807,6 +3807,10 @@ static const char readme_msg[] = "\t trigger: traceon, traceoff\n" "\t enable_event::\n" "\t disable_event::\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist::\n" + "\t disable_hist::\n" +#endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif @@ -3867,6 +3871,10 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. The syntax is analagous to\n" + "\t the enable_event and disable_event triggers.\n" #endif ; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 505f8a45f426..cab1f4bfe85b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1166,8 +1166,10 @@ extern const struct file_operations event_hist_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); +extern int register_trigger_hist_enable_disable_cmds(void); #else static inline int register_trigger_hist_cmd(void) { return 0; } +static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } #endif extern int register_trigger_cmds(void); @@ -1185,6 +1187,34 @@ struct event_trigger_data { struct list_head list; }; +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" +#define ENABLE_HIST_STR "enable_hist" +#define DISABLE_HIST_STR "disable_hist" + +struct enable_trigger_data { + struct trace_event_file *file; + bool enable; + bool hist; +}; + +extern int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); +extern int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file); +extern void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_ops *ops, struct event_trigger_data *data); @@ -1198,6 +1228,8 @@ extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); extern int register_event_command(struct event_command *cmd); +extern int unregister_event_command(struct event_command *cmd); +extern int register_trigger_hist_enable_disable_cmds(void); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4f4041d76926..5d4f02792440 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1393,3 +1393,118 @@ __init int register_trigger_hist_cmd(void) return ret; } + +static void +hist_enable_trigger(struct event_trigger_data *data, void *rec) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (enable_data->enable) + test->paused = false; + else + test->paused = true; + break; + } + } +} + +static void +hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + hist_enable_trigger(data, rec); +} + +static struct event_trigger_ops hist_enable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_enable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops * +hist_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); + + if (enable) + ops = param ? &hist_enable_count_trigger_ops : + &hist_enable_trigger_ops; + else + ops = param ? &hist_disable_count_trigger_ops : + &hist_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_hist_enable_cmd = { + .name = ENABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_hist_disable_cmd = { + .name = DISABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_hist_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_hist_enable_cmd); + unregister_event_command(&trigger_hist_disable_cmd); +} + +__init int register_trigger_hist_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_hist_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_hist_enable_disable_cmds(); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d29092afe005..d133f2094566 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd) * Currently we only unregister event commands from __init, so mark * this __init too. */ -static __init int unregister_event_command(struct event_command *cmd) +__init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; int ret = -ENODEV; @@ -1062,15 +1062,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) unregister_event_command(&trigger_traceoff_cmd); } -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct enable_trigger_data { - struct trace_event_file *file; - bool enable; -}; - static void event_enable_trigger(struct event_trigger_data *data, void *rec) { @@ -1100,14 +1091,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) event_enable_trigger(data, rec); } -static int -event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; seq_printf(m, "%s:%s:%s", - enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->hist ? + (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) : + (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR), enable_data->file->event_call->class->system, trace_event_name(enable_data->file->event_call)); @@ -1124,9 +1117,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, return 0; } -static void -event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1171,10 +1163,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -static int -event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1183,6 +1174,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, struct trace_array *tr = file->tr; const char *system; const char *event; + bool hist = false; char *trigger; char *number; bool enable; @@ -1207,8 +1199,15 @@ event_enable_trigger_func(struct event_command *cmd_ops, if (!event_enable_file) goto out; - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#ifdef CONFIG_HIST_TRIGGERS + hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || + (strcmp(cmd, DISABLE_HIST_STR) == 0)); + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); ret = -ENOMEM; @@ -1228,6 +1227,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->list); RCU_INIT_POINTER(trigger_data->filter, NULL); + enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; trigger_data->private_data = enable_data; @@ -1305,10 +1305,10 @@ event_enable_trigger_func(struct event_command *cmd_ops, goto out; } -static int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1318,6 +1318,8 @@ static int event_enable_register_trigger(char *glob, list_for_each_entry_rcu(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && + (test->cmd_ops->trigger_type == + data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { ret = -EEXIST; goto out; @@ -1343,10 +1345,10 @@ out: return ret; } -static void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; @@ -1356,6 +1358,8 @@ static void event_enable_unregister_trigger(char *glob, list_for_each_entry_rcu(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && + (data->cmd_ops->trigger_type == + test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); @@ -1375,8 +1379,12 @@ event_enable_get_trigger_ops(char *cmd, char *param) struct event_trigger_ops *ops; bool enable; +#ifdef CONFIG_HIST_TRIGGERS + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; - +#endif if (enable) ops = param ? &event_enable_count_trigger_ops : &event_enable_trigger_ops; @@ -1447,6 +1455,7 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_enable_disable_cmds(); register_trigger_hist_cmd(); return 0; -- cgit v1.2.3 From 52a7f16dedff8f23d03df3ea556dec95b92a5801 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:57 -0600 Subject: tracing: Add support for multiple hist triggers per event Allow users to define any number of hist triggers per trace event. Any number of hist triggers may be added for a given event, which may differ by key, value, or filter. Reading the event's 'hist' file will display the output of all the hist triggers defined on an event concatenated in the order they were defined. Link: http://lkml.kernel.org/r/48a0c8dd34c344571de880fb35e211c6d9a28961.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +- kernel/trace/trace_events_hist.c | 172 +++++++++++++++++++++++++++++++-------- 2 files changed, 145 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8430145bea12..e89b2ed76d2d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3856,9 +3856,11 @@ static const char readme_msg[] = "\t sort field. The 'size' parameter can be used to specify more\n" "\t or fewer than the default 2048 entries for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" - "\t table in its entirety to stdout. The default format used to\n" - "\t display a given field can be modified by appending any of the\n" - "\t following modifiers to the field name, as applicable:\n\n" + "\t table in its entirety to stdout. If there are multiple hist\n" + "\t triggers attached to an event, there will be a table for each\n" + "\t trigger in the output. The default format used to display a\n" + "\t given field can be modified by appending any of the following\n" + "\t modifiers to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5d4f02792440..4b02f8ab4dd3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1032,33 +1032,18 @@ static int print_entries(struct seq_file *m, return n_entries; } -static int hist_show(struct seq_file *m, void *v) +static void hist_trigger_show(struct seq_file *m, + struct event_trigger_data *data, int n) { - struct event_trigger_data *test, *data = NULL; - struct trace_event_file *event_file; struct hist_trigger_data *hist_data; int n_entries, ret = 0; - mutex_lock(&event_mutex); - - event_file = event_file_data(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } - - list_for_each_entry_rcu(test, &event_file->triggers, list) { - if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - data = test; - break; - } - } - if (!data) - goto out_unlock; + if (n > 0) + seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); data->ops->print(m, data->ops, data); - seq_puts(m, "\n"); + seq_puts(m, "#\n\n"); hist_data = data->private_data; n_entries = print_entries(m, hist_data); @@ -1070,6 +1055,27 @@ static int hist_show(struct seq_file *m, void *v) seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), n_entries, (u64)atomic64_read(&hist_data->map->drops)); +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_show(m, data, n++); + } + out_unlock: mutex_unlock(&event_mutex); @@ -1233,6 +1239,54 @@ static void hist_clear(struct event_trigger_data *data) data->paused = paused; } +static bool hist_trigger_match(struct event_trigger_data *data, + struct event_trigger_data *data_test) +{ + struct tracing_map_sort_key *sort_key, *sort_key_test; + struct hist_trigger_data *hist_data, *hist_data_test; + struct hist_field *key_field, *key_field_test; + unsigned int i; + + hist_data = data->private_data; + hist_data_test = data_test->private_data; + + if (hist_data->n_vals != hist_data_test->n_vals || + hist_data->n_fields != hist_data_test->n_fields || + hist_data->n_sort_keys != hist_data_test->n_sort_keys) + return false; + + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + + for_each_hist_field(i, hist_data) { + key_field = hist_data->fields[i]; + key_field_test = hist_data_test->fields[i]; + + if (key_field->flags != key_field_test->flags) + return false; + if (key_field->field != key_field_test->field) + return false; + if (key_field->offset != key_field_test->offset) + return false; + } + + for (i = 0; i < hist_data->n_sort_keys; i++) { + sort_key = &hist_data->sort_keys[i]; + sort_key_test = &hist_data_test->sort_keys[i]; + + if (sort_key->field_idx != sort_key_test->field_idx || + sort_key->descending != sort_key_test->descending) + return false; + } + + if (data->filter_str && + (strcmp(data->filter_str, data_test->filter_str) != 0)) + return false; + + return true; +} + static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -1243,6 +1297,8 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test)) + continue; if (hist_data->attrs->pause) test->paused = true; else if (hist_data->attrs->cont) @@ -1282,6 +1338,44 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, return ret; } +static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + bool unregistered = false; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test)) + continue; + unregistered = true; + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (unregistered && test->ops->free) + test->ops->free(test->ops, test); +} + +static void hist_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + static int event_hist_trigger_func(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param) @@ -1331,22 +1425,19 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger_data->private_data = hist_data; + /* if param is non-empty, it's supposed to be a filter */ + if (param && cmd_ops->set_filter) { + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + } + if (glob[0] == '!') { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); ret = 0; goto out_free; } - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; - - ret = cmd_ops->set_filter(param, trigger_data, file); - if (ret < 0) - goto out_free; - out_reg: ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); /* * The above returns on success the # of triggers registered, @@ -1379,7 +1470,8 @@ static struct event_command trigger_hist_cmd = { .flags = EVENT_CMD_FL_NEEDS_REC, .func = event_hist_trigger_func, .reg = hist_register_trigger, - .unreg = unregister_trigger, + .unreg = hist_unregister_trigger, + .unreg_all = hist_unreg_all, .get_trigger_ops = event_hist_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1406,7 +1498,6 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec) test->paused = false; else test->paused = true; - break; } } } @@ -1469,12 +1560,28 @@ hist_enable_get_trigger_ops(char *cmd, char *param) return ops; } +static void hist_enable_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) { + list_del_rcu(&test->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, .func = event_enable_trigger_func, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1485,6 +1592,7 @@ static struct event_command trigger_hist_disable_cmd = { .func = event_enable_trigger_func, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; -- cgit v1.2.3 From db1388b4ffa9e31e9ff0abacc3bdb121bec8c688 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:58 -0600 Subject: tracing: Add support for named triggers Named triggers are sets of triggers that share a common set of trigger data. An example of functionality that could benefit from this type of capability would be a set of inlined probes that would each contribute event counts, for example, to a shared counter data structure. The first named trigger registered with a given name owns the common trigger data that the others subsequently registered with the same name will reference. The functions defined here allow users to add, delete, and find named triggers. It also adds functions to pause and unpause named triggers; since named triggers act upon common data, they should also be paused and unpaused as a group. Link: http://lkml.kernel.org/r/c09ff648360f65b10a3e321eddafe18060b4a04f.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 13 ++++ kernel/trace/trace_events_trigger.c | 143 ++++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cab1f4bfe85b..727a3d28bce5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1184,7 +1184,11 @@ struct event_trigger_data { char *filter_str; void *private_data; bool paused; + bool paused_tmp; struct list_head list; + char *name; + struct list_head named_list; + struct event_trigger_data *named_data; }; /* Avoid typos */ @@ -1227,6 +1231,15 @@ extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); +extern struct event_trigger_data *find_named_trigger(const char *name); +extern bool is_named_trigger(struct event_trigger_data *test); +extern int save_named_trigger(const char *name, + struct event_trigger_data *data); +extern void del_named_trigger(struct event_trigger_data *data); +extern void pause_named_trigger(struct event_trigger_data *data); +extern void unpause_named_trigger(struct event_trigger_data *data); +extern void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d133f2094566..a975571cde24 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); @@ -764,6 +765,148 @@ int set_trigger_filter(char *filter_str, return ret; } +static LIST_HEAD(named_triggers); + +/** + * find_named_trigger - Find the common named trigger associated with @name + * @name: The name of the set of named triggers to find the common data for + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * returns the common trigger data associated with that first + * registered instance. + * + * Return: the common trigger data for the given named trigger on + * success, NULL otherwise. + */ +struct event_trigger_data *find_named_trigger(const char *name) +{ + struct event_trigger_data *data; + + if (!name) + return NULL; + + list_for_each_entry(data, &named_triggers, named_list) { + if (data->named_data) + continue; + if (strcmp(data->name, name) == 0) + return data; + } + + return NULL; +} + +/** + * is_named_trigger - determine if a given trigger is a named trigger + * @test: The trigger data to test + * + * Return: true if 'test' is a named trigger, false otherwise. + */ +bool is_named_trigger(struct event_trigger_data *test) +{ + struct event_trigger_data *data; + + list_for_each_entry(data, &named_triggers, named_list) { + if (test == data) + return true; + } + + return false; +} + +/** + * save_named_trigger - save the trigger in the named trigger list + * @name: The name of the named trigger set + * @data: The trigger data to save + * + * Return: 0 if successful, negative error otherwise. + */ +int save_named_trigger(const char *name, struct event_trigger_data *data) +{ + data->name = kstrdup(name, GFP_KERNEL); + if (!data->name) + return -ENOMEM; + + list_add(&data->named_list, &named_triggers); + + return 0; +} + +/** + * del_named_trigger - delete a trigger from the named trigger list + * @data: The trigger data to delete + */ +void del_named_trigger(struct event_trigger_data *data) +{ + kfree(data->name); + data->name = NULL; + + list_del(&data->named_list); +} + +static void __pause_named_trigger(struct event_trigger_data *data, bool pause) +{ + struct event_trigger_data *test; + + list_for_each_entry(test, &named_triggers, named_list) { + if (strcmp(test->name, data->name) == 0) { + if (pause) { + test->paused_tmp = test->paused; + test->paused = true; + } else { + test->paused = test->paused_tmp; + } + } + } +} + +/** + * pause_named_trigger - Pause all named triggers with the same name + * @data: The trigger data of a named trigger to pause + * + * Pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * pausing only one is meaningless, so pausing one named trigger needs + * to pause all triggers with the same name. + */ +void pause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, true); +} + +/** + * unpause_named_trigger - Un-pause all named triggers with the same name + * @data: The trigger data of a named trigger to unpause + * + * Un-pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * unpausing only one is meaningless, so unpausing one named trigger + * needs to unpause all triggers with the same name. + */ +void unpause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, false); +} + +/** + * set_named_trigger_data - Associate common named trigger data + * @data: The trigger data of a named trigger to unpause + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * associates the common trigger data from the first trigger with the + * given trigger. + */ +void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data) +{ + data->named_data = named_data; +} + static void traceon_trigger(struct event_trigger_data *data, void *rec) { -- cgit v1.2.3 From 5463bfda327b1f7310556ef3136533e27c774f13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:59 -0600 Subject: tracing: Add support for named hist triggers Allow users to define 'named' hist triggers. All triggers created with the same 'name=xxx' option will update the same shared histogram data. This expands the hist trigger syntax from this: # echo hist:keys=xxx ... [ if filter] > event/trigger to this: # echo hist:name=xxx:keys=xxx ... [ if filter] > event/trigger Named histograms must use a 'compatible' set of keys and values, which means each event added to a set of named triggers must have the same names and types. Reading the 'hist' file of any of the participating events will produce the same output as any other participating event, which is to be expected since they share the same data. Link: http://lkml.kernel.org/r/1dbc84ee3322a75daaf5b3ef1d0cc0a2fb682fc7.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++-- kernel/trace/trace_events_hist.c | 148 ++++++++++++++++++++++++++++++++++----- 2 files changed, 141 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e89b2ed76d2d..4e342d354c12 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3842,6 +3842,7 @@ static const char readme_msg[] = "\t [:sort=]\n" "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" + "\t [:name=histname1]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -3854,13 +3855,18 @@ static const char readme_msg[] = "\t specified using the 'sort' keyword. The sort direction can\n" "\t be modified by appending '.descending' or '.ascending' to a\n" "\t sort field. The 'size' parameter can be used to specify more\n" - "\t or fewer than the default 2048 entries for the hashtable size.\n\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n" + "\t If a hist trigger is given a name using the 'name' parameter,\n" + "\t its histogram data will be shared with other triggers of the\n" + "\t same name, and trigger hits will update this common data.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. If there are multiple hist\n" "\t triggers attached to an event, there will be a table for each\n" - "\t trigger in the output. The default format used to display a\n" - "\t given field can be modified by appending any of the following\n" - "\t modifiers to the field name, as applicable:\n\n" + "\t trigger in the output. The table displayed for a named\n" + "\t trigger will be the same as any other instance having the\n" + "\t same name. The default format used to display a given field\n" + "\t can be modified by appending any of the following modifiers\n" + "\t to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4b02f8ab4dd3..bdea5603cbde 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -117,6 +117,7 @@ struct hist_trigger_attrs { char *keys_str; char *vals_str; char *sort_key_str; + char *name; bool pause; bool cont; bool clear; @@ -200,6 +201,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) if (!attrs) return; + kfree(attrs->name); kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); @@ -227,6 +229,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) attrs->vals_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "sort=", strlen("sort=")) == 0) attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "name=", strlen("name=")) == 0) + attrs->name = kstrdup(str, GFP_KERNEL); else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || @@ -1131,7 +1135,12 @@ static int event_hist_trigger_print(struct seq_file *m, struct hist_field *key_field; unsigned int i; - seq_puts(m, "hist:keys="); + seq_puts(m, "hist:"); + + if (data->name) + seq_printf(m, "%s:", data->name); + + seq_puts(m, "keys="); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -1196,6 +1205,19 @@ static int event_hist_trigger_print(struct seq_file *m, return 0; } +static int event_hist_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (!data->ref && hist_data->attrs->name) + save_named_trigger(hist_data->attrs->name, data); + + data->ref++; + + return 0; +} + static void event_hist_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data) { @@ -1206,6 +1228,8 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, data->ref--; if (!data->ref) { + if (data->name) + del_named_trigger(data); trigger_data_free(data); destroy_hist_data(hist_data); } @@ -1214,10 +1238,44 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, static struct event_trigger_ops event_hist_trigger_ops = { .func = event_hist_trigger, .print = event_hist_trigger_print, - .init = event_trigger_init, + .init = event_hist_trigger_init, .free = event_hist_trigger_free, }; +static int event_hist_trigger_named_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + + save_named_trigger(data->named_data->name, data); + + event_hist_trigger_init(ops, data->named_data); + + return 0; +} + +static void event_hist_trigger_named_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + event_hist_trigger_free(ops, data->named_data); + + data->ref--; + if (!data->ref) { + del_named_trigger(data); + trigger_data_free(data); + } +} + +static struct event_trigger_ops event_hist_trigger_named_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_named_init, + .free = event_hist_trigger_named_free, +}; + static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, char *param) { @@ -1227,26 +1285,54 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; - bool paused; - paused = data->paused; - data->paused = true; + if (data->name) + pause_named_trigger(data); synchronize_sched(); tracing_map_clear(hist_data->map); - data->paused = paused; + if (data->name) + unpause_named_trigger(data); +} + +static bool compatible_field(struct ftrace_event_field *field, + struct ftrace_event_field *test_field) +{ + if (field == test_field) + return true; + if (field == NULL || test_field == NULL) + return false; + if (strcmp(field->name, test_field->name) != 0) + return false; + if (strcmp(field->type, test_field->type) != 0) + return false; + if (field->size != test_field->size) + return false; + if (field->is_signed != test_field->is_signed) + return false; + + return true; } static bool hist_trigger_match(struct event_trigger_data *data, - struct event_trigger_data *data_test) + struct event_trigger_data *data_test, + struct event_trigger_data *named_data, + bool ignore_filter) { struct tracing_map_sort_key *sort_key, *sort_key_test; struct hist_trigger_data *hist_data, *hist_data_test; struct hist_field *key_field, *key_field_test; unsigned int i; + if (named_data && (named_data != data_test) && + (named_data != data_test->named_data)) + return false; + + if (!named_data && is_named_trigger(data_test)) + return false; + hist_data = data->private_data; hist_data_test = data_test->private_data; @@ -1255,9 +1341,11 @@ static bool hist_trigger_match(struct event_trigger_data *data, hist_data->n_sort_keys != hist_data_test->n_sort_keys) return false; - if ((data->filter_str && !data_test->filter_str) || - (!data->filter_str && data_test->filter_str)) - return false; + if (!ignore_filter) { + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + } for_each_hist_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -1265,7 +1353,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, if (key_field->flags != key_field_test->flags) return false; - if (key_field->field != key_field_test->field) + if (!compatible_field(key_field->field, key_field_test->field)) return false; if (key_field->offset != key_field_test->offset) return false; @@ -1280,7 +1368,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, return false; } - if (data->filter_str && + if (!ignore_filter && data->filter_str && (strcmp(data->filter_str, data_test->filter_str) != 0)) return false; @@ -1292,12 +1380,26 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct trace_event_file *file) { struct hist_trigger_data *hist_data = data->private_data; - struct event_trigger_data *test; + struct event_trigger_data *test, *named_data = NULL; int ret = 0; + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) { + ret = -EINVAL; + goto out; + } + } + } + + if (hist_data->attrs->name && !named_data) + goto new; + list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test)) + if (!hist_trigger_match(data, test, named_data, false)) continue; if (hist_data->attrs->pause) test->paused = true; @@ -1310,12 +1412,19 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } } - + new: if (hist_data->attrs->cont || hist_data->attrs->clear) { ret = -ENOENT; goto out; } + if (named_data) { + destroy_hist_data(data->private_data); + data->private_data = named_data->private_data; + set_named_trigger_data(data, named_data); + data->ops = &event_hist_trigger_named_ops; + } + if (hist_data->attrs->pause) data->paused = true; @@ -1329,6 +1438,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret++; update_cond_flag(file); + if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); update_cond_flag(file); @@ -1342,12 +1452,16 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { - struct event_trigger_data *test; + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; bool unregistered = false; + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test)) + if (!hist_trigger_match(data, test, named_data, false)) continue; unregistered = true; list_del_rcu(&test->list); -- cgit v1.2.3 From 4b94f5b7b4a5ffd885609bd033af2ecca0c9cc54 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 3 Mar 2016 12:55:02 -0600 Subject: tracing: Add hist trigger 'log2' modifier Allow users to have numeric fields displayed as log2 values in case value range is very wide by appending '.log2' to field names. For example, # echo 'hist:key=bytes_req' > kmalloc/trigger # cat kmalloc/hist { bytes_req: 504 } hitcount: 1 { bytes_req: 11 } hitcount: 1 { bytes_req: 104 } hitcount: 1 { bytes_req: 48 } hitcount: 1 { bytes_req: 2048 } hitcount: 1 { bytes_req: 4096 } hitcount: 1 { bytes_req: 240 } hitcount: 1 { bytes_req: 392 } hitcount: 1 { bytes_req: 13 } hitcount: 1 { bytes_req: 28 } hitcount: 1 { bytes_req: 12 } hitcount: 1 { bytes_req: 64 } hitcount: 2 { bytes_req: 128 } hitcount: 2 { bytes_req: 32 } hitcount: 2 { bytes_req: 8 } hitcount: 11 { bytes_req: 10 } hitcount: 13 { bytes_req: 24 } hitcount: 25 { bytes_req: 160 } hitcount: 29 { bytes_req: 16 } hitcount: 33 { bytes_req: 80 } hitcount: 36 When using '.log2' modifier, the output looks like: # echo 'hist:key=bytes_req.log2' > kmalloc/trigger # cat kmalloc/hist { bytes_req: ~ 2^12 } hitcount: 1 { bytes_req: ~ 2^11 } hitcount: 1 { bytes_req: ~ 2^9 } hitcount: 2 { bytes_req: ~ 2^6 } hitcount: 3 { bytes_req: ~ 2^3 } hitcount: 13 { bytes_req: ~ 2^5 } hitcount: 19 { bytes_req: ~ 2^8 } hitcount: 49 { bytes_req: ~ 2^7 } hitcount: 57 { bytes_req: ~ 2^4 } hitcount: 74 Link: http://lkml.kernel.org/r/7ff396b246c6a881f46b979735fddf05a0d6c71a.1457029949.git.tom.zanussi@linux.intel.com Cc: Tom Zanussi Cc: Masami Hiramatsu Signed-off-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + kernel/trace/trace_events_hist.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4e342d354c12..988a35263fdd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3872,6 +3872,7 @@ static const char readme_msg[] = "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n\n" + "\t .log2 display log2 value rather than raw number\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bdea5603cbde..23df38aab503 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -68,6 +68,13 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event) return (u64)(unsigned long)*addr; } +static u64 hist_field_log2(struct hist_field *hist_field, void *event) +{ + u64 val = *(u64 *)(event + hist_field->field->offset); + + return (u64) ilog2(roundup_pow_of_two(val)); +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ { \ @@ -111,6 +118,7 @@ enum hist_field_flags { HIST_FIELD_FL_EXECNAME = 64, HIST_FIELD_FL_SYSCALL = 128, HIST_FIELD_FL_STACKTRACE = 256, + HIST_FIELD_FL_LOG2 = 512, }; struct hist_trigger_attrs { @@ -358,6 +366,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (flags & HIST_FIELD_FL_LOG2) { + hist_field->fn = hist_field_log2; + goto out; + } + if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; @@ -522,6 +535,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_EXECNAME; else if (strcmp(field_str, "syscall") == 0) flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(field_str, "log2") == 0) + flags |= HIST_FIELD_FL_LOG2; else { ret = -EINVAL; goto out; @@ -980,6 +995,9 @@ hist_trigger_entry_print(struct seq_file *m, key + key_field->offset, HIST_STACKTRACE_DEPTH); multiline = true; + } else if (key_field->flags & HIST_FIELD_FL_LOG2) { + seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, + *(u64 *)(key + key_field->offset)); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -1112,6 +1130,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "execname"; else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; return flags_str; } -- cgit v1.2.3 From d50c744ecde7ee3ba4d7ffb0e1c55e7a2f6bbc8e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 8 Mar 2016 17:17:15 -0500 Subject: tracing: Fix unsigned comparison to zero in hist trigger code Fengguang Wu's bot found two comparisons of unsigned integers to zero. These were real bugs, as it would miss error conditions returned to zero. trace_events_hist.c:426:6-9: WARNING: Unsigned expression compared with zero: idx < 0 trace_events_hist.c:568:5-14: WARNING: Unsigned expression compared with zero: n_entries < 0 Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 23df38aab503..f98b6b3a2804 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -734,7 +734,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) struct tracing_map *map = hist_data->map; struct ftrace_event_field *field; struct hist_field *hist_field; - unsigned int i, idx; + int i, idx; for_each_hist_field(i, hist_data) { hist_field = hist_data->fields[i]; @@ -1036,7 +1036,7 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - unsigned int i, n_entries; + int i, n_entries; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, -- cgit v1.2.3 From 205506228b69be4efbb3809957a69420f26a8d9f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 25 Apr 2016 22:40:12 -0400 Subject: tracing: Do not inherit event-fork option for instances As the event-fork option requires doing work when enabled and disabled, it can not be passed down to created instances. The instance must clear this flag when it is created, and must clear it when its removed. As more options may be created with this need, a macro ZEROED_TRACE_FLAGS is created that holds the flags that must not be inherited by the top level instance, and must be cleared on removal of instances. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 988a35263fdd..5e3ad3481e4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec) #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +/* trace_flags that are default zero for instances */ +#define ZEROED_TRACE_FLAGS \ + TRACE_ITER_EVENT_FORK /* * The global_trace is the descriptor that holds the tracing @@ -6710,7 +6713,7 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; - tr->trace_flags = global_trace.trace_flags; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -6784,6 +6787,12 @@ static int instance_rmdir(const char *name) list_del(&tr->list); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1 << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1 << i, 0); + } + tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); -- cgit v1.2.3 From c8585c6fcaf2011de54c3592e80a634a2b9e1a7f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 25 Apr 2016 23:22:35 -0400 Subject: ext4: fix races between changing inode journal mode and ext4_writepages In ext4, there is a race condition between changing inode journal mode and ext4_writepages(). While ext4_writepages() is executed on a non-journalled mode inode, the inode's journal mode could be enabled by ioctl() and then, some pages dirtied after switching the journal mode will be still exposed to ext4_writepages() in non-journaled mode. To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan Kara's suggestion because we don't want to waste ext4_inode_info's space for this extra rare case. Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- kernel/locking/percpu-rwsem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..bec0b647f9cc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); /* * This is the fast-path for down_read/up_read. If it succeeds we rely -- cgit v1.2.3 From 4812952f9c94f67b3cc78ad0a6482bf182aa5a44 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 23 Apr 2016 13:23:47 +0300 Subject: tracing: checking for NULL instead of IS_ERR() tracing_map_elt_alloc() returns ERR_PTRs on error, never NULL. Fixes: 08d43a5fa063 ('tracing: Add lock-free tracing_map') Link: http://lkml.kernel.org/r/20160423102347.GA11136@mwanda Acked-by: Tom Zanussi Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/tracing_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index e0f172932eca..e7dfc5eec669 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -814,7 +814,7 @@ static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) unsigned int i; dup_elt = tracing_map_elt_alloc(elt->map); - if (!dup_elt) + if (IS_ERR(dup_elt)) return NULL; if (elt->map->ops && elt->map->ops->elt_copy) -- cgit v1.2.3 From 432480c58219eff32904b879eb3fcc1d268a3b06 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 25 Apr 2016 14:01:27 -0500 Subject: tracing: Add check for NULL event field when creating hist field Smatch flagged create_hist_field() as possibly being able to dereference a NULL pointer, although the current code exits in all cases where the event field could be NULL, so it's not actually a problem. Still, to prevent future changes to the code from overlooking new cases, make the NULL pointer check explicit and warn once in that case. Link: http://lkml.kernel.org/r/cfbc003f534a3e441b4313272fd412310aba6336.1461610073.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f98b6b3a2804..0c05b8a99806 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -371,6 +371,9 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (WARN_ON_ONCE(!field)) + goto out; + if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; -- cgit v1.2.3 From 6e4cf657defa8fb06dbf9ed91adb78414758f259 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 25 Apr 2016 14:01:28 -0500 Subject: tracing: Handle tracing_map_alloc_elts() error path correctly If tracing_map_elt_alloc() fails, it will return ERR_PTR() instead of NULL, so change the check to IS_ERROR(). We also need to set the failed entry in the map->elts array to NULL instead of ERR_PTR() so tracing_map_free_elts() doesn't try freeing an ERR_PTR(). tracing_map_free_elts() should also zero out what it frees so a reentrant call won't find previously freed elements. Link: http://lkml.kernel.org/r/f29d03b00bce3aac8cf151a8a30e6c83e5fee66d.1461610073.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/tracing_map.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index e7dfc5eec669..0a689bbb78ef 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -366,10 +366,13 @@ static void tracing_map_free_elts(struct tracing_map *map) if (!map->elts) return; - for (i = 0; i < map->max_elts; i++) + for (i = 0; i < map->max_elts; i++) { tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i))); + *(TRACING_MAP_ELT(map->elts, i)) = NULL; + } tracing_map_array_free(map->elts); + map->elts = NULL; } static int tracing_map_alloc_elts(struct tracing_map *map) @@ -383,7 +386,8 @@ static int tracing_map_alloc_elts(struct tracing_map *map) for (i = 0; i < map->max_elts; i++) { *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map); - if (!(*(TRACING_MAP_ELT(map->elts, i)))) { + if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) { + *(TRACING_MAP_ELT(map->elts, i)) = NULL; tracing_map_free_elts(map); return -ENOMEM; -- cgit v1.2.3 From 4afe6495e5cb3c352d95f07512cbb227e607e2ce Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Mon, 18 Apr 2016 15:23:29 +0800 Subject: tracing: Don't use the address of the buffer array name in copy_from_user With the following code snippet: ... char buf[64]; ... if (copy_from_user(&buf, ubuf, cnt)) ... Even though the value of "&buf" equals "buf", but there is no need to get the address of the "buf" again. Use "buf" instead of "&buf". Link: http://lkml.kernel.org/r/20160418152329.18b72bea@debian Signed-off-by: Wang Xiaoqiang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5e3ad3481e4b..46028d47d252 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3664,7 +3664,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -4537,7 +4537,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -5327,7 +5327,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; -- cgit v1.2.3 From db0a6fb5d97afe01fd9c47d37c6daa82d4d4001d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 21 Apr 2016 14:14:01 -0400 Subject: audit: add tty field to LOGIN event The tty field was missing from AUDIT_LOGIN events. Refactor code to create a new function audit_get_tty(), using it to replace the call in audit_log_task_info() and to add it to audit_log_set_loginuid(). Lock and bump the kref to protect it, adding audit_put_tty() alias to decrement it. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 18 +++++------------- kernel/auditsc.c | 8 ++++++-- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f52fbefede09..384374a1d232 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -64,7 +64,6 @@ #include #endif #include -#include #include #include @@ -1871,21 +1870,14 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - char *tty; + struct tty_struct *tty; if (!ab) return; /* tsk == current */ cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - + tty = audit_get_tty(tsk); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" @@ -1901,11 +1893,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), - tty, audit_get_sessionid(tsk)); - + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(tsk)); + audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..71e14d836e69 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1980,6 +1980,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; if (!audit_enabled) return; @@ -1987,14 +1988,17 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(current); ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, oldsessionid, sessionid, !rc); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); audit_log_end(ab); } -- cgit v1.2.3 From 7132e2d669bd42c3783327f301aaac5f4463299b Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 25 Apr 2016 18:56:14 -0300 Subject: ftrace: Match dot symbols when searching functions on ppc64 In the ppc64 big endian ABI, function symbols point to function descriptors. The symbols which point to the function entry points have a dot in front of the function name. Consequently, when the ftrace filter mechanism searches for the symbol corresponding to an entry point address, it gets the dot symbol. As a result, ftrace filter users have to be aware of this ABI detail on ppc64 and prepend a dot to the function name when setting the filter. The perf probe command insulates the user from this by ignoring the dot in front of the symbol name when matching function names to symbols, but the sysfs interface does not. This patch makes the ftrace filter mechanism do the same when searching symbols. Fixes the following failure in ftracetest's kprobe_ftrace.tc: .../kprobe_ftrace.tc: line 9: echo: write error: Invalid argument That failure is on this line of kprobe_ftrace.tc: echo _do_fork > set_ftrace_filter This is because there's no _do_fork entry in the functions list: # cat available_filter_functions | grep _do_fork ._do_fork This change introduces no regressions on the perf and ftracetest testsuite results. Cc: Steven Rostedt Cc: Ingo Molnar Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Thiago Jung Bauermann Acked-by: Steven Rostedt Signed-off-by: Michael Ellerman --- kernel/trace/ftrace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e8d792da963..a6c8252d7776 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3456,11 +3456,23 @@ struct ftrace_glob { int type; }; +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. +*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; + str = arch_ftrace_match_adjust(str, g->search); + switch (g->type) { case MATCH_FULL: if (strcmp(str, g->search) == 0) -- cgit v1.2.3 From dad56ee742a3abbb5d9e8108f8537d412bff3f57 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 26 Apr 2016 21:22:22 -0400 Subject: tracing: Move event_trigger_unlock_commit{_regs}() to local header The functions event_trigger_unlock_commit() and event_trigger_unlock_commit_regs() are no longer used outside the tracing system. Move them out of the generic headers and into the local one. Along with __event_trigger_test_discard() that is only used by them. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 727a3d28bce5..c0eac7b1e5a6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,6 +1065,100 @@ struct trace_subsystem_dir { int nr_events; }; +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they wil be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. + */ +static inline bool +__event_trigger_test_discard(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) + ring_buffer_discard_commit(buffer, event); + else if (!filter_check_discard(file, entry, buffer, event)) + return false; + + return true; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + */ +static inline void +event_trigger_unlock_commit(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). + */ +static inline void +event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -- cgit v1.2.3 From 65da9a0a3bf2202c2432f42d41eb908f2fa30579 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Apr 2016 10:13:46 -0400 Subject: tracing: Make filter_check_discard() local Nothing outside of the tracing directory calls filter_check_discard() or check_filter_check_discard(). They should not be called by modules. Move their prototypes into the local tracing header and remove their EXPORT_SYMBOL() macros. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46028d47d252..02f5a5f51d49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -318,7 +318,6 @@ int filter_check_discard(struct trace_event_file *file, void *rec, return 0; } -EXPORT_SYMBOL_GPL(filter_check_discard); int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, @@ -332,7 +331,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } -EXPORT_SYMBOL_GPL(call_filter_check_discard); static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c0eac7b1e5a6..ee8691c66bfe 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,6 +1065,12 @@ struct trace_subsystem_dir { int nr_events; }; +extern int filter_check_discard(struct trace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From 9cbb1506ab2db987c160e7fc50665bf47b5b6fa1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Apr 2016 11:09:42 -0400 Subject: tracing: Fold filter_check_discard() into its only user The function filter_check_discard() is small and only called by one user, its code can be folded into that one caller and make the code a bit less comlplex. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 ------------- kernel/trace/trace.h | 13 ++++++------- 2 files changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02f5a5f51d49..1ba54e241c8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -306,19 +306,6 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } - - return 0; -} - int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ee8691c66bfe..0862e7559548 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,9 +1065,6 @@ struct trace_subsystem_dir { int nr_events; }; -extern int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); @@ -1096,12 +1093,14 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, entry))) { ring_buffer_discard_commit(buffer, event); - else if (!filter_check_discard(file, entry, buffer, event)) - return false; + return true; + } - return true; + return false; } /** -- cgit v1.2.3 From fa66ddb870ca022342fe6d1312ef76d2f7233a1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 Apr 2016 12:04:13 -0400 Subject: tracing: Move trace_buffer_unlock_commit{_regs}() to local header The functions trace_buffer_unlock_commit() and the _regs() version are only used within the kernel/trace directory. Move them to the local header and remove the export as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ba54e241c8d..94e7e4d11b79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1696,7 +1696,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1748,7 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0862e7559548..bd5ae56dec7a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1068,6 +1068,16 @@ struct trace_subsystem_dir { extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From a9fe48dcde88fd48e210e4280f19cb9300ec9112 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 16:12:39 -0400 Subject: tracing: Remove unused function trace_current_buffer_discard_commit() The function trace_current_buffer_discard_commit() has no callers, remove it. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 94e7e4d11b79..e5bdb9accf52 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1734,7 +1734,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, @@ -1748,13 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - ring_buffer_discard_commit(buffer, event); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); - void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, -- cgit v1.2.3 From 33fddff24d05d71f97722cb7deec4964d39d10dc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 17:44:01 -0400 Subject: tracing: Have trace_buffer_unlock_commit() call the _regs version with NULL There's no real difference between trace_buffer_unlock_commit() and trace_buffer_unlock_commit_regs() except that the former passes NULL to ftrace_stack_trace() instead of regs. Have the former be a static inline of the latter which passes NULL for regs. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 ----------- kernel/trace/trace.h | 13 +++++++++---- 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5bdb9accf52..41bf14412666 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1686,17 +1686,6 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve ring_buffer_unlock_commit(buffer, event); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __buffer_unlock_commit(buffer, event); - - ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); - ftrace_trace_userstack(buffer, flags, pc); -} - static struct ring_buffer *temp_buffer; struct ring_buffer_event * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bd5ae56dec7a..10156a09103f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1069,15 +1069,20 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs); + +static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); +} + /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From 9b9db275051cd9191e7776c4fd79ccd4318aa2dc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 18:10:21 -0400 Subject: tracing: Remove one use of trace_current_buffer_lock_reserve() The only user of trace_current_buffer_lock_reserve() is in the boot up self tests. Restructure the code a little to have that code use what everything else uses: trace_event_buffer_lock_reserve(). Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7cb983ee93c..da1eeb6190e3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3392,7 +3392,7 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static struct trace_array *event_tr; +static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, @@ -3416,17 +3416,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, local_save_flags(flags); - event = trace_current_buffer_lock_reserve(&buffer, - TRACE_FN, sizeof(*entry), - flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); - + event_trigger_unlock_commit(&event_trace_file, buffer, event, + entry, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); @@ -3441,9 +3441,11 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; - event_tr = top_trace_array(); - if (WARN_ON(!event_tr)) + + event_trace_file.tr = top_trace_array(); + if (WARN_ON(!event_trace_file.tr)) return; + ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); -- cgit v1.2.3 From 904d1857ad09b43f514897dd42daffe200d1ca50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 18:11:54 -0400 Subject: tracing: Remove unused function trace_current_buffer_lock_reserve() trace_current_buffer_lock_reserve() has no more users. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 41bf14412666..c09e8ffadc73 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1714,16 +1714,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, -- cgit v1.2.3 From dcb0b5575d24a32f51a3f1003312fb94ed4e214a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 2 May 2016 21:30:04 -0400 Subject: tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic Nothing sets TRACE_EVENT_FL_USE_CALL_FILTER anymore. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 71 ++++++-------------------------------- 1 file changed, 10 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b3f5051cd4e9..d1d27bf37a19 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps, static inline struct event_filter *event_filter(struct trace_event_file *file) { - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - return file->event_call->filter; - else - return file->filter; + return file->filter; } /* caller must hold event_mutex */ @@ -826,12 +823,7 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_FILTERED; - else - file->flags &= ~EVENT_FILE_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; } static void __free_filter(struct event_filter *filter) @@ -883,13 +875,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) static inline void __remove_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - filter_disable(file); - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - remove_filter_string(call->filter); - else - remove_filter_string(file->filter); + remove_filter_string(file->filter); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -906,15 +893,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { - __free_filter(call->filter); - call->filter = NULL; - } else { - __free_filter(file->filter); - file->filter = NULL; - } + __free_filter(file->filter); + file->filter = NULL; } static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, @@ -1718,69 +1698,38 @@ fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - file->flags |= EVENT_FILE_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; } static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - rcu_assign_pointer(call->filter, filter); - else - rcu_assign_pointer(file->filter, filter); + rcu_assign_pointer(file->filter, filter); } static inline void event_clear_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - RCU_INIT_POINTER(call->filter, NULL); - else - RCU_INIT_POINTER(file->filter, NULL); + RCU_INIT_POINTER(file->filter, NULL); } static inline void event_set_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void event_clear_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool event_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; - if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && - (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) - return true; - return false; } -- cgit v1.2.3 From 0fc1b09ff1ff404ddf753f5ffa5cd0adc8fdcdc9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 May 2016 17:15:43 -0400 Subject: tracing: Use temp buffer when filtering events Filtering of events requires the data to be written to the ring buffer before it can be decided to filter or not. This is because the parameters of the filter are based on the result that is written to the ring buffer and not on the parameters that are passed into the trace functions. The ftrace ring buffer is optimized for writing into the ring buffer and committing. The discard procedure used when filtering decides the event should be discarded is much more heavy weight. Thus, using a temporary filter when filtering events can speed things up drastically. Without a temp buffer we have: # trace-cmd start -p nop # perf stat -r 10 hackbench 50 0.790706626 seconds time elapsed ( +- 0.71% ) # trace-cmd start -e all # perf stat -r 10 hackbench 50 1.566904059 seconds time elapsed ( +- 0.27% ) # trace-cmd start -e all -f 'common_preempt_count==20' # perf stat -r 10 hackbench 50 1.690598511 seconds time elapsed ( +- 0.19% ) # trace-cmd start -e all -f 'common_preempt_count!=20' # perf stat -r 10 hackbench 50 1.707486364 seconds time elapsed ( +- 0.30% ) The first run above is without any tracing, just to get a based figure. hackbench takes ~0.79 seconds to run on the system. The second run enables tracing all events where nothing is filtered. This increases the time by 100% and hackbench takes 1.57 seconds to run. The third run filters all events where the preempt count will equal "20" (this should never happen) thus all events are discarded. This takes 1.69 seconds to run. This is 10% slower than just committing the events! The last run enables all events and filters where the filter will commit all events, and this takes 1.70 seconds to run. The filtering overhead is approximately 10%. Thus, the discard and commit of an event from the ring buffer may be about the same time. With this patch, the numbers change: # trace-cmd start -p nop # perf stat -r 10 hackbench 50 0.778233033 seconds time elapsed ( +- 0.38% ) # trace-cmd start -e all # perf stat -r 10 hackbench 50 1.582102692 seconds time elapsed ( +- 0.28% ) # trace-cmd start -e all -f 'common_preempt_count==20' # perf stat -r 10 hackbench 50 1.309230710 seconds time elapsed ( +- 0.22% ) # trace-cmd start -e all -f 'common_preempt_count!=20' # perf stat -r 10 hackbench 50 1.786001924 seconds time elapsed ( +- 0.20% ) The first run is again the base with no tracing. The second run is all tracing with no filtering. It is a little slower, but that may be well within the noise. The third run shows that discarding all events only took 1.3 seconds. This is a speed up of 23%! The discard is much faster than even the commit. The one downside is shown in the last run. Events that are not discarded by the filter will take longer to add, this is due to the extra copy of the event. Cc: Alexei Starovoitov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 154 +++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 19 ++++- kernel/trace/trace_events.c | 10 +++ kernel/trace/trace_events_filter.c | 10 +++ 4 files changed, 185 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c09e8ffadc73..8a4bd6b68a0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -312,7 +312,7 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, { if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); + __trace_event_discard_commit(buffer, event); return 1; } @@ -1660,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, @@ -1669,21 +1679,136 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) { - struct trace_entry *ent = ring_buffer_event_data(event); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + +DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DEFINE_PER_CPU(int, trace_buffered_event_cnt); +static int trace_buffered_event_ref; + +/** + * trace_buffered_event_enable - enable buffering events + * + * When events are being filtered, it is quicker to use a temporary + * buffer to write the event data into if there's a likely chance + * that it will not be committed. The discard of the ring buffer + * is not as fast as committing, and is much slower than copying + * a commit. + * + * When an event is to be filtered, allocate per cpu buffers to + * write the event data into, and if the event is filtered and discarded + * it is simply dropped, otherwise, the entire data is to be committed + * in one shot. + */ +void trace_buffered_event_enable(void) +{ + struct ring_buffer_event *event; + struct page *page; + int cpu; - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (trace_buffered_event_ref++) + return; + + for_each_tracing_cpu(cpu) { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + goto failed; + + event = page_address(page); + memset(event, 0, sizeof(*event)); + + per_cpu(trace_buffered_event, cpu) = event; + + preempt_disable(); + if (cpu == smp_processor_id() && + this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + preempt_enable(); } - return event; + return; + failed: + trace_buffered_event_disable(); +} + +static void enable_trace_buffered_event(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); + this_cpu_dec(trace_buffered_event_cnt); +} + +static void disable_trace_buffered_event(void *data) +{ + this_cpu_inc(trace_buffered_event_cnt); +} + +/** + * trace_buffered_event_disable - disable buffering events + * + * When a filter is removed, it is faster to not use the buffered + * events, and to commit directly into the ring buffer. Free up + * the temp buffers when there are no more users. This requires + * special synchronization with current events. + */ +void trace_buffered_event_disable(void) +{ + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (WARN_ON_ONCE(!trace_buffered_event_ref)) + return; + + if (--trace_buffered_event_ref) + return; + + preempt_disable(); + /* For each CPU, set the buffer as used. */ + smp_call_function_many(tracing_buffer_mask, + disable_trace_buffered_event, NULL, 1); + preempt_enable(); + + /* Wait for all current users to finish */ + synchronize_sched(); + + for_each_tracing_cpu(cpu) { + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; + } + /* + * Make sure trace_buffered_event is NULL before clearing + * trace_buffered_event_cnt. + */ + smp_wmb(); + + preempt_disable(); + /* Do the work on each cpu */ + smp_call_function_many(tracing_buffer_mask, + enable_trace_buffered_event, NULL, 1); + preempt_enable(); } void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_cmdline_save, true); - ring_buffer_unlock_commit(buffer, event); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); } static struct ring_buffer *temp_buffer; @@ -1695,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, unsigned long flags, int pc) { struct ring_buffer_event *entry; + int val; *current_rb = trace_file->tr->trace_buffer.buffer; + + if ((trace_file->flags & + (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + (entry = this_cpu_read(trace_buffered_event))) { + /* Try to use the per cpu buffer first */ + val = this_cpu_inc_return(trace_buffered_event_cnt); + if (val == 1) { + trace_event_setup(entry, type, flags, pc); + entry->array[0] = len; + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); + } + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10156a09103f..5167c366d6b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1083,6 +1083,23 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr, trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); } +DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DECLARE_PER_CPU(int, trace_buffered_event_cnt); +void trace_buffered_event_disable(void); +void trace_buffered_event_enable(void); + +static inline void +__trace_event_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (this_cpu_read(trace_buffered_event) == event) { + /* Simply release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + return; + } + ring_buffer_discard_commit(buffer, event); +} + /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires @@ -1111,7 +1128,7 @@ __event_trigger_test_discard(struct trace_event_file *file, if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, entry))) { - ring_buffer_discard_commit(buffer, event); + __trace_event_discard_commit(buffer, event); return true; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index da1eeb6190e3..4d006707b947 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -363,6 +363,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -445,6 +446,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } + /* Enable or disable use of trace_buffered_event */ + if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != + (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + trace_buffered_event_enable(); + else + trace_buffered_event_disable(); + } + return ret; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d1d27bf37a19..9daa9b3bc6d9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -823,7 +823,12 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { + unsigned long old_flags = file->flags; + file->flags &= ~EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_disable(); } static void __free_filter(struct event_filter *filter) @@ -1698,7 +1703,12 @@ fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { + unsigned long old_flags = file->flags; + file->flags |= EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_enable(); } static inline void event_set_filter(struct trace_event_file *file, -- cgit v1.2.3 From 470bf1f27a1472264d18c84b324389509f0e30b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 24 Mar 2016 02:46:33 +0100 Subject: seccomp: Fix comment typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop accidentally repeated word in comment. Signed-off-by: Mickaël Salaün Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e1e5a354854e..6c9bb62ed046 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -915,7 +915,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, fprog = filter->prog->orig_prog; if (!fprog) { - /* This must be a new non-cBPF filter, since we save every + /* This must be a new non-cBPF filter, since we save * every cBPF filter's orig_prog above when * CONFIG_CHECKPOINT_RESTORE is enabled. */ -- cgit v1.2.3 From cc622420798c4bcf093785d872525087a7798db9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Apr 2016 17:35:29 +0200 Subject: gcov: disable for COMPILE_TEST Enabling gcov is counterproductive to compile testing: it significantly increases the kernel image size, compile time, and it produces lots of false positive "may be used uninitialized" warnings as the result of missed optimizations. This is in line with how UBSAN_SANITIZE_ALL and PROFILE_ALL_BRANCHES work, both of which have similar problems. With an ARM allmodconfig kernel, I see the build time drop from 283 minutes CPU time to 225 minutes, and the vmlinux size drops from 43MB to 26MB. Signed-off-by: Arnd Bergmann Acked-by: Peter Oberparleiter Signed-off-by: Michal Marek --- kernel/gcov/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index c92e44855ddd..1276aabaab55 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL config GCOV_PROFILE_ALL bool "Profile entire Kernel" + depends on !COMPILE_TEST depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n -- cgit v1.2.3 From c983f0e8677d5d686ea81de13f9c9ac34f21a2a7 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 29 Mar 2016 09:35:32 +0100 Subject: seccomp: Get compat syscalls from asm-generic header Move retrieval of compat syscall numbers into inline function defined in asm-generic header so that arches may override it. [ralf@linux-mips.org: Resolve merge conflict.] Suggested-by: Paul Burton Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Will Drewry Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/12978/ Signed-off-by: Ralf Baechle --- kernel/seccomp.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e1e5a354854e..737436ebb4fe 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -518,19 +518,12 @@ static int mode1_syscalls[] = { 0, /* null terminated */ }; -#ifdef CONFIG_COMPAT -static int mode1_syscalls_32[] = { - __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ -}; -#endif - static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = mode1_syscalls_32; + syscall_whitelist = get_compat_mode1_syscalls(); #endif do { if (*syscall_whitelist == this_syscall) -- cgit v1.2.3 From cb4253aa0f77f20be018970dbe5d01d78b930ef9 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 29 Mar 2016 09:35:34 +0100 Subject: secomp: Constify mode1 syscall whitelist These values are constant and should be marked as such. Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: Will Drewry Cc: Andy Lutomirski Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/12979/ Signed-off-by: Ralf Baechle --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 737436ebb4fe..a0ffcb1a2bee 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -513,14 +513,14 @@ static void seccomp_send_sigsys(int syscall, int reason) * To be fully secure this must be combined with rlimit * to limit the stack allocations too. */ -static int mode1_syscalls[] = { +static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, 0, /* null terminated */ }; static void __secure_computing_strict(int this_syscall) { - int *syscall_whitelist = mode1_syscalls; + const int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) syscall_whitelist = get_compat_mode1_syscalls(); -- cgit v1.2.3 From a831100aeefbe6d9f3e47a3e2712f82c042f1f5c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 May 2016 16:34:53 -0300 Subject: perf core: Generalize max_stack sysctl handler So that it can be used for other stack related knobs, such as the upcoming one to tweak the max number of of contexts per stack sample. In all those cases we can only change the value if there are no perf sessions collecting stacks, so they need to grab that mutex, etc. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8t3fk94wuzp8m2z1n4gc0s17@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 5 +++-- kernel/sysctl.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b9325e7dcba1..7fc89939ede9 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -228,7 +228,8 @@ exit_put: int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int new_value = sysctl_perf_event_max_stack, ret; + int *value = table->data; + int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; @@ -240,7 +241,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write, if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else - sysctl_perf_event_max_stack = new_value; + *value = new_value; mutex_unlock(&callchain_mutex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..0ec6907a16b3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1149,7 +1149,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "perf_event_max_stack", - .data = NULL, /* filled in by handler */ + .data = &sysctl_perf_event_max_stack, .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, -- cgit v1.2.3 From cfbcf468454ab4b20f0b4b62da51920b99fdb19e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 28 Apr 2016 12:30:53 -0300 Subject: perf core: Pass max stack as a perf_callchain_entry context This makes perf_callchain_{user,kernel}() receive the max stack as context for the perf_callchain_entry, instead of accessing the global sysctl_perf_event_max_stack. Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/bpf/stackmap.c | 3 ++- kernel/events/callchain.c | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f5a19548be12..a82d7605db3f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -136,7 +136,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 7fc89939ede9..af95ad92893a 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -32,12 +32,12 @@ static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, +__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } -__weak void perf_callchain_user(struct perf_callchain_entry *entry, +__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -176,14 +176,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; + struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); @@ -193,12 +194,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) goto exit_put; + ctx.entry = entry; + ctx.max_stack = max_stack; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); + perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(&ctx, regs); } if (user) { @@ -214,8 +218,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + perf_callchain_store(&ctx, PERF_CONTEXT_USER); + perf_callchain_user(&ctx, regs); } } -- cgit v1.2.3 From 3b1fff08038bd0792b1aa1e9703b2dd0512a3fd0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 May 2016 18:08:32 -0300 Subject: perf core: Add a 'nr' field to perf_event_callchain_context We will use it to count how many addresses are in the entry->ip[] array, excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really return the number of entries specified by the user via the relevant sysctl, kernel.perf_event_max_contexts, or via the per event perf_event_attr.sample_max_stack knob. This way we keep the perf_sample->ip_callchain->nr meaning, that is the number of entries, be it real addresses or PERF_CONTEXT_ entries, while honouring the max_stack knobs, i.e. the end result will be max_stack entries if we have at least that many entries in a given stack trace. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index af95ad92893a..8774ff86debb 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -196,8 +196,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.entry = entry; ctx.max_stack = max_stack; - - entry->nr = init_nr; + ctx.nr = entry->nr = init_nr; if (kernel && !user_mode(regs)) { if (add_mark) -- cgit v1.2.3 From 3e4de4ec4cfea40994b47a79767610153edbf45b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 May 2016 13:01:50 -0300 Subject: perf core: Add perf_callchain_store_context() helper We need have different helpers to account how many contexts we have in the sample and for real addresses, so do it now as a prep patch, to ease review. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-q964tnyuqrxw5gld18vizs3c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8774ff86debb..ca645736a983 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -200,7 +200,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } @@ -217,7 +217,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(&ctx, PERF_CONTEXT_USER); + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); perf_callchain_user(&ctx, regs); } } -- cgit v1.2.3 From c85b03349640b34f3545503c8429fc43005e9a92 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 May 2016 13:06:21 -0300 Subject: perf core: Separate accounting of contexts and real addresses in a stack trace The perf_sample->ip_callchain->nr value includes all the entries in the ip_callchain->ip[] array, real addresses and PERF_CONTEXT_{KERNEL,USER,etc}, while what the user expects is that what is in the kernel.perf_event_max_stack sysctl or in the upcoming per event perf_event_attr.sample_max_stack knob be honoured in terms of IP addresses in the stack trace. So allocate a bunch of extra entries for contexts, and do the accounting via perf_callchain_entry_ctx struct members. A new sysctl, kernel.perf_event_max_contexts_per_stack is also introduced for investigating possible bugs in the callchain implementation by some arch. Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-3b4wnqk340c4sg4gwkfdi9yk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 10 +++++++++- kernel/sysctl.c | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index ca645736a983..179ef4640964 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -19,11 +19,13 @@ struct callchain_cpus_entries { }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; +int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + - sizeof(__u64) * sysctl_perf_event_max_stack); + sizeof(__u64) * (sysctl_perf_event_max_stack + + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); @@ -197,6 +199,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) @@ -228,6 +232,10 @@ exit_put: return entry; } +/* + * Used for sysctl_perf_event_max_stack and + * sysctl_perf_event_max_contexts_per_stack. + */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ec6907a16b3..bec4c11c47d6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1156,6 +1156,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &six_hundred_forty_kb, }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &one_thousand, + }, #endif #ifdef CONFIG_KMEMCHECK { -- cgit v1.2.3 From 60f05e86cf3e8c5f379fe5ba94634fcec17dd67e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 18 May 2016 17:55:28 +0530 Subject: cpufreq: schedutil: Improve prints messages with pr_fmt Prefix print messages with KBUILD_MODNAME, i.e 'cpufreq_schedutil: '. This helps to keep similar formatting for all the print messages particular to a file and identify those easily in kernel logs. Its already done this way for rest of the governors. Along with that, remove the (now) redundant bits from a print message. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 154ae3a51e86..14c4aa25cc45 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -9,6 +9,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -388,7 +390,7 @@ static int sugov_init(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); - pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret); + pr_err("initialization failed (error %d)\n", ret); return ret; } -- cgit v1.2.3 From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:02 -0700 Subject: time: add missing implementation for timespec64_add_safe() timespec64_add_safe() has been defined in time64.h for 64 bit systems. But, 32 bit systems only have an extern function prototype defined. Provide a definition for the above function. The function will be necessary as part of y2038 changes. struct timespec is not y2038 safe. All references to timespec will be replaced by struct timespec64. The function is meant to be a replacement for timespec_add_safe(). The implementation is similar to timespec_add_safe(). Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/time.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index a4064b612066..cb1f83eb5599 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +#if __BITS_PER_LONG != 64 + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +#endif -- cgit v1.2.3 From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:08 -0700 Subject: time: remove timespec_add_safe() All references to timespec_add_safe() now use timespec64_add_safe(). The plan is to replace struct timespec references with struct timespec64 throughout the kernel as timespec is not y2038 safe. Drop timespec_add_safe() and use timespec64_add_safe() for all architectures. Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/time.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index cb1f83eb5599..667b9335f5d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } -#if __BITS_PER_LONG != 64 - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). @@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } - -#endif -- cgit v1.2.3 From 02a982a6ec631d871571f940ca13817551759884 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:26 -0700 Subject: workqueue: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int) Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f5068e94003..6751b18fd9ac 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -437,7 +437,7 @@ static void *work_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int work_fixup_init(void *addr, enum debug_obj_state state) +static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -445,9 +445,9 @@ static int work_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -456,7 +456,7 @@ static int work_fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int work_fixup_activate(void *addr, enum debug_obj_state state) +static bool work_fixup_activate(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -471,16 +471,16 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); - return 0; + return false; } WARN_ON_ONCE(1); - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -488,7 +488,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int work_fixup_free(void *addr, enum debug_obj_state state) +static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -496,9 +496,9 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } -- cgit v1.2.3 From e3252464da222ef2c2ca52dff383a824080ea3d5 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:29 -0700 Subject: timer: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to cheange (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/hrtimer.c | 18 +++++++++--------- kernel/time/timer.c | 30 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa0b983290cf..f962a58c0957 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -342,9 +342,9 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -353,19 +353,19 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_NOTAVAILABLE: WARN_ON_ONCE(1); - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -373,7 +373,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -381,9 +381,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 73164c3aa56b..be33481a4da1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -493,7 +493,7 @@ static void *timer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -501,9 +501,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -518,7 +518,7 @@ static void stub_timer(unsigned long data) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -534,18 +534,18 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); - return 0; + return false; } else { setup_timer(timer, stub_timer, 0); - return 1; + return true; } - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -553,7 +553,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) +static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -561,9 +561,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -571,7 +571,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) * fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -584,13 +584,13 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) * is tracked in the object tracker. */ debug_object_init(timer, &timer_debug_descr); - return 0; + return false; } else { setup_timer(timer, stub_timer, 0); - return 1; + return true; } default: - return 0; + return false; } } -- cgit v1.2.3 From 3263d28eb5b93b3c1b024366df87b1d0c774228f Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:32 -0700 Subject: rcu: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to cheange (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/update.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ccdc8eebc5a..a9df198eb22d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -386,7 +386,7 @@ void destroy_rcu_head(struct rcu_head *head) * - an unknown object is activated (might be a statically initialized object) * Activation is performed internally by call_rcu(). */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) { struct rcu_head *head = addr; @@ -399,9 +399,9 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) */ debug_object_init(head, &rcuhead_debug_descr); debug_object_activate(head, &rcuhead_debug_descr); - return 0; + return false; default: - return 1; + return true; } } -- cgit v1.2.3 From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:41 -0700 Subject: debugobjects: insulate non-fixup logic related to static obj from fixup callbacks When activating a static object we need make sure that the object is tracked in the object tracker. If it is a non-static object then the activation is illegal. In previous implementation, each subsystem need take care of this in their fixup callbacks. Actually we can put it into debugobjects core. Thus we can save duplicated code, and have *pure* fixup callbacks. To achieve this, a new callback "is_static_object" is introduced to let the type specific code decide whether a object is static or not. If yes, we take it into object tracker, otherwise give warning and invoke fixup callback. This change has paassed debugobjects selftest, and I also do some test with all debugobjects supports enabled. At last, I have a concern about the fixups that can it change the object which is in incorrect state on fixup? Because the 'addr' may not point to any valid object if a non-static object is not tracked. Then Change such object can overwrite someone's memory and cause unexpected behaviour. For example, the timer_fixup_activate bind timer to function stub_timer. Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com [changbin.du@intel.com: improve code comments where invoke the new is_static_object callback] Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/update.c | 26 +++----------------------- kernel/time/hrtimer.c | 7 +------ kernel/time/timer.c | 43 ++++++++++++++----------------------------- kernel/workqueue.c | 42 ++++++++---------------------------------- 4 files changed, 26 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a9df198eb22d..3e888cd5a594 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. - */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return false; - default: - return true; - } + return true; } /** @@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f962a58c0957..8c7392c4fdbd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return false; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index be33481a4da1..3a95f9728778 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized @@ -516,30 +524,16 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } - return false; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); @@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - debug_object_init(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } + setup_timer(timer, stub_timer, 0); + return true; default: return false; } @@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6751b18fd9ac..e1c0e996b5ae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } +static bool work_is_static_object(void *addr) +{ + struct work_struct *work = addr; + + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); +} + /* * fixup_init is called when: * - an active object is initialized @@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state) } } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static bool work_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return false; - } - WARN_ON_ONCE(1); - return false; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return false; - } -} - /* * fixup_free is called when: * - an active object is freed @@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; -- cgit v1.2.3 From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Thu, 19 May 2016 17:09:56 -0700 Subject: kernel/padata.c: removed unused code By accident I stumbled across code that has never been used. This driver has EXPORT_SYMBOL functions, and the only user of the code is pcrypt.c, but this only uses a subset of the exported symbols. According to 'git log -G', the functions, padata_set_cpumasks, padata_add_cpu, and padata_remove_cpu have never been used since they were first introduced. This patch removes the unused code. On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than 4k smaller. kbuild_hp> size $KBUILD_OUTPUT/vmlinux text data bss dec hex filename 10566658 4678360 1122304 16367322 f9beda vmlinux 10561984 4678360 1122304 16362648 f9ac98 vmlinux On another config, 32 bit, the saving is about 0.5k bytes. kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux 6012005 2409513 2785280 11206798 ab008e vmlinux 6011491 2409513 2785280 11206284 aafe8c vmlinux Signed-off-by: Richard Cochran Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/padata.c | 64 --------------------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9c466a..67ddd4acde9d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -606,33 +606,6 @@ out_replace: return 0; } -/** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - /** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. @@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. - * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v1.2.3 From 19d795b677bda354644cfb87a196b087fdc2a965 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 May 2016 17:09:59 -0700 Subject: kernel/padata.c: hide unused functions A recent cleanup removed some exported functions that were not used anywhere, which in turn exposed the fact that some other functions in the same file are only used in some configurations. We now get a warning about them when CONFIG_HOTPLUG_CPU is disabled: kernel/padata.c:670:12: error: '__padata_remove_cpu' defined but not used [-Werror=unused-function] static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) ^~~~~~~~~~~~~~~~~~~ kernel/padata.c:650:12: error: '__padata_add_cpu' defined but not used [-Werror=unused-function] static int __padata_add_cpu(struct padata_instance *pinst, int cpu) This rearranges the code so the __padata_remove_cpu/__padata_add_cpu functions are within the #ifdef that protects the code that calls them. [akpm@linux-foundation.org: coding-style fixes] Fixes: 4ba6d78c671e ("kernel/padata.c: removed unused code") Signed-off-by: Arnd Bergmann Cc: Richard Cochran Cc: Steffen Klassert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/padata.c | 74 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 67ddd4acde9d..993278895ccc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -647,6 +647,43 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err = -EINVAL; + + __padata_start(pinst); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; @@ -726,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) } EXPORT_SYMBOL(padata_remove_cpu); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err =-EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - -#ifdef CONFIG_HOTPLUG_CPU - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || -- cgit v1.2.3 From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:49 -0700 Subject: mm: rename _count, field of the struct page, to _refcount Many developers already know that field for reference count of the struct page is _count and atomic type. They would try to handle it directly and this could break the purpose of page reference count tracepoint. To prevent direct _count modification, this patch rename it to _refcount and add warning message on the code. After that, developer who need to handle reference count will find that field should not be accessed directly. [akpm@linux-foundation.org: fix comments, per Vlastimil] [akpm@linux-foundation.org: Documentation/vm/transhuge.txt too] [sfr@canb.auug.org.au: sync ethernet driver changes] Signed-off-by: Joonsoo Kim Signed-off-by: Stephen Rothwell Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. Miller" Cc: Sunil Goutham Cc: Chris Metcalf Cc: Manish Chopra Cc: Yuval Mintz Cc: Tariq Toukan Cc: Saeed Mahameed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3ee3b86..1c03dfb4abfd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); -- cgit v1.2.3 From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:10:58 -0700 Subject: include/linux/nodemask.h: create next_node_in() helper Lots of code does node = next_node(node, XXX); if (node == MAX_NUMNODES) node = first_node(XXX); so create next_node_in() to do this and use it in various places. [mhocko@suse.com: use next_node_in() helper] Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Michal Hocko Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956baba1..611cc69af8f0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) -- cgit v1.2.3 From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:50 -0700 Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob. Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code. But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..2effd84d83e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { -- cgit v1.2.3 From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:30 -0700 Subject: cpuset: use static key better and convert to new API An important function for cpusets is cpuset_node_allowed(), which optimizes on the fact if there's a single root CPU set, it must be trivially allowed. But the check "nr_cpusets() <= 1" doesn't use the cpusets_enabled_key static key the right way where static keys eliminate branching overhead with jump labels. This patch converts it so that static key is used properly. It's also switched to the new static key API and the checking functions are converted to return bool instead of int. We also provide a new variant __cpuset_zone_allowed() which expects that the static key check was already done and they key was enabled. This is needed for get_page_from_freelist() where we want to also avoid the relatively slower check when ALLOC_CPUSET is not set in alloc_flags. The impact on the page allocator microbenchmark is less than expected but the cleanup in itself is worthwhile. 4.6.0-rc2 4.6.0-rc2 multcheck-v1r20 cpuset-v1r20 Min alloc-odr0-1 348.00 ( 0.00%) 348.00 ( 0.00%) Min alloc-odr0-2 254.00 ( 0.00%) 254.00 ( 0.00%) Min alloc-odr0-4 213.00 ( 0.00%) 213.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 183.00 ( 1.61%) Min alloc-odr0-16 173.00 ( 0.00%) 171.00 ( 1.16%) Min alloc-odr0-32 166.00 ( 0.00%) 163.00 ( 1.81%) Min alloc-odr0-64 162.00 ( 0.00%) 159.00 ( 1.85%) Min alloc-odr0-128 160.00 ( 0.00%) 157.00 ( 1.88%) Min alloc-odr0-256 169.00 ( 0.00%) 166.00 ( 1.78%) Min alloc-odr0-512 180.00 ( 0.00%) 180.00 ( 0.00%) Min alloc-odr0-1024 188.00 ( 0.00%) 187.00 ( 0.53%) Min alloc-odr0-2048 194.00 ( 0.00%) 193.00 ( 0.52%) Min alloc-odr0-4096 199.00 ( 0.00%) 198.00 ( 0.50%) Min alloc-odr0-8192 202.00 ( 0.00%) 201.00 ( 0.50%) Min alloc-odr0-16384 203.00 ( 0.00%) 202.00 ( 0.49%) Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Acked-by: Zefan Li Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 611cc69af8f0..73e93e53884d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include #include -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); -- cgit v1.2.3 From 6112a300c9e41993cc0dc56ac393743d28381284 Mon Sep 17 00:00:00 2001 From: Soumya PN Date: Tue, 17 May 2016 21:31:14 +0530 Subject: ftrace: Don't disable irqs when taking the tasklist_lock read_lock In ftrace.c inside the function alloc_retstack_tasklist() (which will be invoked when function_graph tracing is on) the tasklist_lock is being held as reader while iterating through a list of threads. Here the lock is being held as reader with irqs disabled. The tasklist_lock is never write_locked in interrupt context so it is safe to not disable interrupts for the duration of read_lock in this block which, can be significant, given the block of code iterates through all threads. Hence changing the code to call read_lock() and read_unlock() instead of read_lock_irqsave() and read_unlock_irqrestore(). A similar change was made in commits: 8063e41d2ffc ("tracing: Change syscall_*regfunc() to check PF_KTHREAD and use for_each_process_thread()")' and 3472eaa1f12e ("sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock()")' Link: http://lkml.kernel.org/r/1463500874-77480-1-git-send-email-soumya.p.n@hpe.com Signed-off-by: Soumya PN Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1870fbd2b67..a6804823a058 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5713,7 +5713,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) { int i; int ret = 0; - unsigned long flags; int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; struct task_struct *g, *t; @@ -5729,7 +5728,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); do_each_thread(g, t) { if (start == end) { ret = -EAGAIN; @@ -5747,7 +5746,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } while_each_thread(g, t); unlock: - read_unlock_irqrestore(&tasklist_lock, flags); + read_unlock(&tasklist_lock); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); -- cgit v1.2.3 From b7552e1bccbe3da9c8e7386c6188e8ea4667c8e7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 May 2016 14:14:28 +0200 Subject: bpf: rather use get_random_int for randomizations Start address randomization and blinding in BPF currently use prandom_u32(). prandom_u32() values are not exposed to unpriviledged user space to my knowledge, but given other kernel facilities such as ASLR, stack canaries, etc make use of stronger get_random_int(), we better make use of it here as well given blinding requests successively new random values. get_random_int() has minimal entropy pool depletion, is not cryptographically secure, but doesn't need to be for our use cases here. Suggested-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f1e8a0def99b..b94a36550591 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->pages = size / PAGE_SIZE; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (prandom_u32() % hole) & ~(alignment - 1); + start = (get_random_int() % hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, struct bpf_insn *to_buff) { struct bpf_insn *to = to_buff; - u32 imm_rnd = prandom_u32(); + u32 imm_rnd = get_random_int(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); -- cgit v1.2.3 From e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 May 2016 17:22:48 -0500 Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the bpf filesystem. Looking at the code I saw a broken usage of mount_ns with current->nsproxy->mnt_ns. As the code does not acquire a reference to the mount namespace it can not possibly be correct to store the mount namespace on the superblock as it does. Replace mount_ns with mount_nodev so that each mount of the bpf filesystem returns a distinct instance, and the code is not buggy. In discussion with Hannes Frederic Sowa it was reported that the use of mount_ns was an attempt to have one bpf instance per mount namespace, in an attempt to keep resources that pin resources from hiding. That intent simply does not work, the vfs is not built to allow that kind of behavior. Which means that the bpf filesystem really is buggy both semantically and in it's implemenation as it does not nor can it implement the original intent. This change is userspace visible, but my experience with similar filesystems leads me to believe nothing will break with a model of each mount of the bpf filesystem is distinct from all others. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Cc: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: "Eric W. Biederman" Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 71b75d9c81da..04be7021f848 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -357,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) static struct dentry *bpf_mount(struct file_system_type *type, int flags, const char *dev_name, void *data) { - return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); + return mount_nodev(type, flags, data, bpf_fill_super); } static struct file_system_type bpf_fs_type = { -- cgit v1.2.3 From d91b28ed42de99217efb2e8cb0357263d6fb737c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 19 May 2016 18:17:13 -0700 Subject: bpf: support decreasing order in direct packet access when packet headers are accessed in 'decreasing' order (like TCP port may be fetched before the program reads IP src) the llvm may generate the following code: [...] // R7=pkt(id=0,off=22,r=70) r2 = *(u32 *)(r7 +0) // good access [...] r7 += 40 // R7=pkt(id=0,off=62,r=70) r8 = *(u32 *)(r7 +0) // good access [...] r1 = *(u32 *)(r7 -20) // this one will fail though it's within a safe range // it's doing *(u32*)(skb->data + 42) Fix verifier to recognize such code pattern Alos turned out that 'off > range' condition is not a verifier bug. It's a buggy program that may do something like: if (ptr + 50 > data_end) return 0; ptr += 60; *(u32*)ptr; in such case emit "invalid access to packet, off=0 size=4, R1(id=0,off=60,r=50)" error message, so all information is available for the program author to fix the program. Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a08d66215245..d54e34874579 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -683,15 +683,11 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, { struct reg_state *regs = env->cur_state.regs; struct reg_state *reg = ®s[regno]; - int linear_size = (int) reg->range - (int) reg->off; - if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) { - verbose("verifier bug\n"); - return -EFAULT; - } - if (off < 0 || off + size > linear_size) { - verbose("invalid access to packet, off=%d size=%d, allowed=%d\n", - off, size, linear_size); + off += reg->off; + if (off < 0 || off + size > reg->range) { + verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } return 0; -- cgit v1.2.3 From 1b9b69ecb3a5236d4d3da0f0fa11af916371841e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 19 May 2016 18:17:14 -0700 Subject: bpf: teach verifier to recognize imm += ptr pattern Humans don't write C code like: u8 *ptr = skb->data; int imm = 4; imm += ptr; but from llvm backend point of view 'imm' and 'ptr' are registers and imm += ptr may be preferred vs ptr += imm depending which register value will be used further in the code, while verifier can only recognize ptr += imm. That caused small unrelated changes in the C code of the bpf program to trigger rejection by the verifier. Therefore teach the verifier to recognize both ptr += imm and imm += ptr. For example: when R6=pkt(id=0,off=0,r=62) R7=imm22 after r7 += r6 instruction will be R6=pkt(id=0,off=0,r=62) R7=pkt(id=0,off=22,r=62) Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d54e34874579..668e07903c8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1245,6 +1245,7 @@ static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *regs = env->cur_state.regs; struct reg_state *dst_reg = ®s[insn->dst_reg]; struct reg_state *src_reg = ®s[insn->src_reg]; + struct reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1267,6 +1268,19 @@ add_imm: */ dst_reg->off += imm; } else { + if (src_reg->type == PTR_TO_PACKET) { + /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ + tmp_reg = *dst_reg; /* save r7 state */ + *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ + src_reg = &tmp_reg; /* pretend it's src_reg state */ + /* if the checks below reject it, the copy won't matter, + * since we're rejecting the whole program. If all ok, + * then imm22 state will be added to r7 + * and r7 will be pkt(id=0,off=22,r=62) while + * r6 will stay as pkt(id=0,off=0,r=62) + */ + } + if (src_reg->type == CONST_IMM) { /* pkt_ptr += reg where reg is known constant */ imm = src_reg->imm; @@ -1565,7 +1579,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } else if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == PTR_TO_PACKET) { + (dst_reg->type == PTR_TO_PACKET || + (BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == PTR_TO_PACKET))) { /* ptr_to_packet += K|X */ return check_packet_ptr_add(env, insn); } else if (BPF_CLASS(insn->code) == BPF_ALU64 && -- cgit v1.2.3 From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file -- cgit v1.2.3 From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:20 -0700 Subject: exit_thread: accept a task parameter to be exited We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..75b34fe835b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent -- cgit v1.2.3 From 0740aa5f6375681c57488c4ea55d05a0341cfc9c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:25 -0700 Subject: fork: free thread in copy_process on failure When using this program (as root): #include #include #include #include #include #include #include #define ITER 1000 #define FORKERS 15 #define THREADS (6000/FORKERS) // 1850 is proc max static void fork_100_wait() { unsigned a, to_wait = 0; printf("\t%d forking %d\n", THREADS, getpid()); for (a = 0; a < THREADS; a++) { switch (fork()) { case 0: usleep(1000); exit(0); break; case -1: break; default: to_wait++; break; } } printf("\t%d forked from %d, waiting for %d\n", THREADS, getpid(), to_wait); for (a = 0; a < to_wait; a++) wait(NULL); printf("\t%d waited from %d\n", THREADS, getpid()); } static void run_forkers() { pid_t forkers[FORKERS]; unsigned a; for (a = 0; a < FORKERS; a++) { switch ((forkers[a] = fork())) { case 0: fork_100_wait(); exit(0); break; case -1: err(1, "DIE fork of %d'th forker", a); break; default: break; } } for (a = 0; a < FORKERS; a++) waitpid(forkers[a], NULL, 0); } int main() { unsigned a; int ret; ret = ioperm(10, 20, 0); if (ret < 0) err(1, "ioperm"); for (a = 0; a < ITER; a++) run_forkers(); return 0; } kmemleak reports many occurences of this leak: unreferenced object 0xffff8805917c8000 (size 8192): comm "fork-leak", pid 2932, jiffies 4295354292 (age 1871.028s) hex dump (first 32 bytes): ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ................ ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ................ backtrace: [] kmemdup+0x25/0x50 [] copy_thread_tls+0x6c3/0x9a0 [] copy_process+0x1a84/0x5790 [] wake_up_new_task+0x2d5/0x6f0 [] _do_fork+0x12d/0x820 ... Due to the leakage of the memory items which should have been freed in arch/x86/kernel/process.c:exit_thread(). Make sure the memory is freed when fork fails later in copy_process. This is done by calling exit_thread with the thread to kill. Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8fbed7194af1..103d78fd8f75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1490,7 +1490,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { retval = PTR_ERR(pid); - goto bad_fork_cleanup_io; + goto bad_fork_cleanup_thread; } } @@ -1652,6 +1652,8 @@ bad_fork_cancel_cgroup: bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_thread: + exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); -- cgit v1.2.3 From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:33 -0700 Subject: printk/nmi: generic solution for safe printk in NMI printk() takes some locks and could not be used a safe way in NMI context. The chance of a deadlock is real especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited. We still should keep the number of messages in NMI context at minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reseted when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and reset the buffer. Then we could use a simple cmpxchg operations to get synchronized with writers. There is also used a spinlock to get synchronized with other flushers. We do not longer use seq_buf because it depends on external lock. It would be hard to make all supported operations safe for a lockless use. It would be confusing and error prone to make only some operations safe. The code is put into separate printk/nmi.c as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The are MN10300 and Xtensa architectures. We need to clean up NMI handling there first. Let's do it separately. The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek Suggested-by: Peter Zijlstra Suggested-by: Steven Rostedt Cc: Jan Kara Acked-by: Russell King [arm part] Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/Makefile | 1 + kernel/printk/internal.h | 44 ++++++++++ kernel/printk/nmi.c | 219 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 19 +--- 4 files changed, 265 insertions(+), 18 deletions(-) create mode 100644 kernel/printk/internal.h create mode 100644 kernel/printk/nmi.c (limited to 'kernel') diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..2de99faedfc1 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,44 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. + */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..303cf0d15e57 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,219 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; + +#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) + return 0; + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..71eba0607034 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; -- cgit v1.2.3 From b522deabc6f18e4f938d93a84f345f2cbf3347d1 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:36 -0700 Subject: printk/nmi: warn when some message has been lost in NMI context We could not resize the temporary buffer in NMI context. Let's warn if a message is lost. This is rather theoretical. printk() should not be used in NMI. The only sensible use is when we want to print backtrace from all CPUs. The current buffer should be enough for this purpose. [akpm@linux-foundation.org: whitespace fixlet] Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Russell King Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/internal.h | 11 +++++++++++ kernel/printk/nmi.c | 5 ++++- kernel/printk/printk.c | 10 ++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2de99faedfc1..341bedccc065 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -34,6 +34,12 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) return this_cpu_read(printk_func)(fmt, args); } +extern atomic_t nmi_message_lost; +static inline int get_nmi_message_lost(void) +{ + return atomic_xchg(&nmi_message_lost, 0); +} + #else /* CONFIG_PRINTK_NMI */ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) @@ -41,4 +47,9 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) return vprintk_default(fmt, args); } +static inline int get_nmi_message_lost(void) +{ + return 0; +} + #endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 303cf0d15e57..572f94922230 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -39,6 +39,7 @@ */ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_nmi_irq_ready; +atomic_t nmi_message_lost; #define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) @@ -64,8 +65,10 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) + if (len >= sizeof(s->buffer)) { + atomic_inc(&nmi_message_lost); return 0; + } /* * Make sure that all old data have been read before the buffer was diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 71eba0607034..e38579d730f4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1617,6 +1617,7 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + int nmi_message_lost; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static unsigned int logbuf_cpu = UINT_MAX; @@ -1667,6 +1668,15 @@ asmlinkage int vprintk_emit(int facility, int level, strlen(recursion_msg)); } + nmi_message_lost = get_nmi_message_lost(); + if (unlikely(nmi_message_lost)) { + text_len = scnprintf(textbuf, sizeof(textbuf), + "BAD LUCK: lost %d message(s) from NMI context!", + nmi_message_lost); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, textbuf, text_len); + } + /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. -- cgit v1.2.3 From 427934b8714ec130b068d1c9d88f25b24aaede32 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:39 -0700 Subject: printk/nmi: increase the size of NMI buffer and make it configurable Testing has shown that the backtrace sometimes does not fit into the 4kB temporary buffer that is used in NMI context. The warnings are gone when I double the temporary buffer size. This patch doubles the buffer size and makes it configurable. Note that this problem existed even in the x86-specific implementation that was added by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). Nobody noticed it because it did not print any warnings. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Russell King Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 572f94922230..bf08557d7e3d 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -41,7 +41,8 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_nmi_irq_ready; atomic_t nmi_message_lost; -#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) +#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - sizeof(struct irq_work)) struct nmi_seq_buf { atomic_t len; /* length of written data */ -- cgit v1.2.3 From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:42 -0700 Subject: printk/nmi: flush NMI messages on the system panic In NMI context, printk() messages are stored into per-CPU buffers to avoid a possible deadlock. They are normally flushed to the main ring buffer via an IRQ work. But the work is never called when the system calls panic() in the very same NMI handler. This patch tries to flush NMI buffers before the crash dump is generated. In this case it does not risk a double release and bails out when the logbuf_lock is already taken. The aim is to get the messages into the main ring buffer when possible. It makes them better accessible in the vmcore. Then the patch tries to flush the buffers second time when other CPUs are down. It might be more aggressive and reset logbuf_lock. The aim is to get the messages available for the consequent kmsg_dump() and console_flush_on_panic() calls. The patch causes vprintk_emit() to be called even in NMI context again. But it is done via printk_deferred() so that the console handling is skipped. Consoles use internal locks and we could not prevent a deadlock easily. They are explicitly called later when the crash dump is not generated, see console_flush_on_panic(). Signed-off-by: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Daniel Thompson Cc: David Miller Cc: Ingo Molnar Cc: Jan Kara Cc: Jiri Kosina Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 1 + kernel/panic.c | 6 +++++- kernel/printk/internal.h | 2 ++ kernel/printk/nmi.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/printk/printk.c | 2 +- 5 files changed, 47 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1c03dfb4abfd..d5d408252992 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -160,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 341bedccc065..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI +extern raw_spinlock_t logbuf_lock; + /* * printk() could not take logbuf_lock in NMI context. Instead, * it temporary stores the strings into a per-CPU buffer. diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bf08557d7e3d..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) { const char *buf = s->buffer + start; - printk("%.*s", (end - start) + 1, buf); + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. + */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + } /* @@ -194,6 +204,33 @@ void printk_nmi_flush(void) __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); } +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + void __init printk_nmi_init(void) { int cpu; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e38579d730f4..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); -- cgit v1.2.3 From ede9c27749b9b35efdffa4f63a39f819d7913752 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:10 -0700 Subject: kernel/sysctl_binary.c: use generic UUID library UUID library provides uuid_be type and uuid_be_to_bin() function. This substitutes open coded variant by generic library calls. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 10a1d7dc9313..6eb99c17dbd8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { - char buf[40], *str = buf; - unsigned char uuid[16]; - int i; + char buf[UUID_STRING_LEN + 1]; + uuid_be uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; - /* Convert the uuid to from a string to binary */ - for (i = 0; i < 16; i++) { - result = -EIO; - if (!isxdigit(str[0]) || !isxdigit(str[1])) - goto out; - - uuid[i] = (hex_to_bin(str[0]) << 4) | - hex_to_bin(str[1]); - str += 2; - if (*str == '-') - str++; - } + result = -EIO; + if (uuid_be_to_bin(buf, &uuid)) + goto out; if (oldlen > 16) oldlen = 16; result = -EFAULT; - if (copy_to_user(oldval, uuid, oldlen)) + if (copy_to_user(oldval, &uuid, oldlen)) goto out; copied = oldlen; -- cgit v1.2.3 From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:33 -0700 Subject: radix-tree: introduce radix_tree_empty Commit e61452365372 ("radix_tree: add support for multi-order entries") left the impression that the support for multiorder radix tree entries was functional. As soon as Ross tried to use it, it became apparent that my testing was completely inadequate, and it didn't even work a little bit for orders that were not a multiple of shift. This series of patches is the result of about 6 weeks of redesign, reimplementation, testing, arguing and hair-pulling. The great news is that the test-suite is now far better than it was. That's reflected in the diffstat for the test-suite alone: 12 files changed, 436 insertions(+), 28 deletions(-) The highlight for users of the tree is that the restriction on the order of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix tree now supports any order between 0 and 64. For those who are interested in how the tree works, patch 9 is probably the most interesting one as it introduces the new machinery for handling sibling entries. I've tried to be fair in attributing authorship to the person who contributed the majority of the code in each patch; Ross has been an invaluable partner in the development of this support and it's fair to say that each of us has code in every commit. I should also express my appreciation of the 0day testing. It prompted me that I was bloating the tinyconfig in an unacceptable way, and it bisected to a commit which contained a rather nasty memory-corruption bug. This patch (of 29): The irqdomain code was checking for 0 or 1 entries, not 0 entries like the comment said they were. Introduce a new helper that will actually check for an empty tree. Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/irqdomain.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d65f6f31a5b3..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); -- cgit v1.2.3 From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 17:21:27 -0700 Subject: x86: remove more uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_from_user_inatomic()" is mostly the special cases for the constant size, and it's actually almost never relevant. Most users aren't actually using a constant size anyway, and the few cases that do small constant copies are better off just using __get_user() instead. So get rid of the unnecessary complexity. Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 +-- kernel/futex.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7edc95edfaee..c01f733ff2e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/futex.c b/kernel/futex.c index c20f06f38ef3..ee25f5ba4aca 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; -- cgit v1.2.3 From 612bacad78ba6d0a91166fc4487af114bac172a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 22 May 2016 23:16:18 +0200 Subject: bpf, inode: disallow userns mounts Follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not mount_ns to mount the bpf filesystem"), which removes the FS_USERNS_MOUNT flag. The original idea was to have a per mountns instance instead of a single global fs instance, but that didn't work out and we had to switch to mount_nodev() model. The intent of that middle ground was that we avoid users who don't play nice to create endless instances of bpf fs which are difficult to control and discover from an admin point of view, but at the same time it would have allowed us to be more flexible with regard to namespaces. Therefore, since we now did the switch to mount_nodev() as a fix where individual instances are created, we also need to remove userns mount flag along with it to avoid running into mentioned situation. I don't expect any breakage at this early point in time with removing the flag and we can revisit this later should the requirement for this come up with future users. This and commit e27f4a942a0e have been split to facilitate tracking should any of them run into the unlikely case of causing a regression. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 04be7021f848..318858edb1cd 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = { .name = "bpf", .mount = bpf_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, }; MODULE_ALIAS_FS("bpf"); -- cgit v1.2.3 From f43edca7ed08fc02279f2a62015da5cb6aa0ad61 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 23 May 2016 16:22:26 -0700 Subject: ELF/MIPS build fix CONFIG_MIPS32_N32=y but CONFIG_BINFMT_ELF disabled results in the following linker errors: arch/mips/built-in.o: In function `elf_core_dump': binfmt_elfn32.c:(.text+0x23dbc): undefined reference to `elf_core_extra_phdrs' binfmt_elfn32.c:(.text+0x246e4): undefined reference to `elf_core_extra_data_size' binfmt_elfn32.c:(.text+0x248d0): undefined reference to `elf_core_write_extra_phdrs' binfmt_elfn32.c:(.text+0x24ac4): undefined reference to `elf_core_write_extra_data' CONFIG_MIPS32_O32=y but CONFIG_BINFMT_ELF disabled results in the following linker errors: arch/mips/built-in.o: In function `elf_core_dump': binfmt_elfo32.c:(.text+0x28a04): undefined reference to `elf_core_extra_phdrs' binfmt_elfo32.c:(.text+0x29330): undefined reference to `elf_core_extra_data_size' binfmt_elfo32.c:(.text+0x2951c): undefined reference to `elf_core_write_extra_phdrs' binfmt_elfo32.c:(.text+0x29710): undefined reference to `elf_core_write_extra_data' This is because binfmt_elfn32 and binfmt_elfo32 are using symbols from elfcore but for these configurations elfcore will not be built. Fixed by making elfcore selectable by a separate config symbol which unlike the current mechanism can also be used from other directories than kernel/, then having each flavor of ELF that relies on elfcore.o, select it in Kconfig, including CONFIG_MIPS32_N32 and CONFIG_MIPS32_O32 which fixes this issue. Link: http://lkml.kernel.org/r/20160520141705.GA1913@linux-mips.org Signed-off-by: Ralf Baechle Reviewed-by: James Hogan Cc: "Maciej W. Rozycki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f0c40bf49d9f..e2ec54e2b952 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o +obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ -- cgit v1.2.3 From bf959931ddb88c4e4366e96dd22e68fa0db9527c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 May 2016 16:23:50 -0700 Subject: wait/ptrace: assume __WALL if the child is traced The following program (simplified version of generated by syzkaller) #include #include #include #include #include void *thread_func(void *arg) { ptrace(PTRACE_TRACEME, 0,0,0); return 0; } int main(void) { pthread_t thread; if (fork()) return 0; while (getppid() != 1) ; pthread_create(&thread, NULL, thread_func, NULL); pthread_join(thread, NULL); return 0; } creates an unreapable zombie if /sbin/init doesn't use __WALL. This is not a kernel bug, at least in a sense that everything works as expected: debugger should reap a traced sub-thread before it can reap the leader, but without __WALL/__WCLONE do_wait() ignores sub-threads. Unfortunately, it seems that /sbin/init in most (all?) distributions doesn't use it and we have to change the kernel to avoid the problem. Note also that most init's use sys_waitid() which doesn't allow __WALL, so the necessary user-space fix is not that trivial. This patch just adds the "ptrace" check into eligible_child(). To some degree this matches the "tsk->ptrace" in exit_notify(), ->exit_signal is mostly ignored when the tracee reports to debugger. Or WSTOPPED, the tracer doesn't need to set this flag to wait for the stopped tracee. This obviously means the user-visible change: __WCLONE and __WALL no longer have any meaning for debugger. And I can only hope that this won't break something, but at least strace/gdb won't suffer. We could make a more conservative change. Say, we can take __WCLONE into account, or !thread_group_leader(). But it would be nice to not complicate these historical/confusing checks. Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Cc: Denys Vlasenko Cc: Jan Kratochvil Cc: "Michael Kerrisk (man-pages)" Cc: Pedro Alves Cc: Roland McGrath Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 75b34fe835b2..44fbe6edd7fe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p) task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1300,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (unlikely(exit_state == EXIT_DEAD)) return 0; - ret = eligible_child(wo, p); + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; -- cgit v1.2.3 From 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 May 2016 16:23:53 -0700 Subject: wait: allow sys_waitid() to accept __WNOTHREAD/__WCLONE/__WALL I see no reason why waitid() can't support other linux-specific flags allowed in sys_wait4(). In particular this change can help if we reconsider the previous change ("wait/ptrace: assume __WALL if the child is traced") which adds the "automagical" __WALL for debugger. Signed-off-by: Oleg Nesterov Cc: Dmitry Vyukov Cc: Denys Vlasenko Cc: Jan Kratochvil Cc: "Michael Kerrisk (man-pages)" Cc: Pedro Alves Cc: Roland McGrath Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 44fbe6edd7fe..9e6e1356e6bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1535,7 +1535,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, enum pid_type type; long ret; - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; -- cgit v1.2.3 From 747800efbe8b98459f48d1d9d742298f8283f8fa Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Mon, 23 May 2016 16:23:59 -0700 Subject: kernel/signal.c: convert printk(KERN_ ...) to pr_(...) Use pr_ instead of printk(KERN_ ). Signed-off-by: Wang Xiaoqiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ab122a2cee41..96e9bc40667f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig) if (!__ratelimit(&ratelimit_state)) return; - printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } @@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); + pr_info("potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) - printk(KERN_INFO "code at %08lx: ", regs->ip); + pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { @@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr) if (get_user(insn, (unsigned char *)(regs->ip + i))) break; - printk(KERN_CONT "%02x ", insn); + pr_cont("%02x ", insn); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); #endif preempt_disable(); show_regs(regs); -- cgit v1.2.3 From 725fc629ff2545b061407305ae51016c9f928fce Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 23 May 2016 16:24:05 -0700 Subject: kernek/fork.c: allocate idle task for a CPU always on its local node Linux preallocates the task structs of the idle tasks for all possible CPUs. This currently means they all end up on node 0. This also implies that the cache line of MWAIT, which is around the flags field in the task struct, are all located in node 0. We see a noticeable performance improvement on Knights Landing CPUs when the cache lines used for MWAIT are located in the local nodes of the CPUs using them. I would expect this to give a (likely slight) improvement on other systems too. The patch implements placing the idle task in the node of its CPUs, by passing the right target node to copy_process() [akpm@linux-foundation.org: use NUMA_NO_NODE, not a bare -1] Link: http://lkml.kernel.org/r/1463492694-15833-1-git-send-email-andi@firstfloor.org Signed-off-by: Andi Kleen Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 103d78fd8f75..e67d7b773348 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -340,13 +340,14 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ } -static struct task_struct *dup_task_struct(struct task_struct *orig) +static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; struct thread_info *ti; - int node = tsk_fork_get_node(orig); int err; + if (node == NUMA_NO_NODE) + node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; @@ -1276,7 +1277,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls) + unsigned long tls, + int node) { int retval; struct task_struct *p; @@ -1328,7 +1330,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; retval = -ENOMEM; - p = dup_task_struct(current); + p = dup_task_struct(current, node); if (!p) goto fork_out; @@ -1706,7 +1708,8 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1751,7 +1754,7 @@ long _do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace, tls); + child_tidptr, NULL, trace, tls, NUMA_NO_NODE); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. -- cgit v1.2.3 From 9b492cf58077a0254eb4b9574029ac6e79add9f9 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 May 2016 16:24:10 -0700 Subject: kexec: introduce a protection mechanism for the crashkernel reserved memory For the cases that some kernel (module) path stamps the crash reserved memory(already mapped by the kernel) where has been loaded the second kernel data, the kdump kernel will probably fail to boot when panic happens (or even not happens) leaving the culprit at large, this is unacceptable. The patch introduces a mechanism for detecting such cases: 1) After each crash kexec loading, it simply marks the reserved memory regions readonly since we no longer access it after that. When someone stamps the region, the first kernel will panic and trigger the kdump. The weak arch_kexec_protect_crashkres() is introduced to do the actual protection. 2) To allow multiple loading, once 1) was done we also need to remark the reserved memory to readwrite each time a system call related to kdump is made. The weak arch_kexec_unprotect_crashkres() is introduced to do the actual protection. The architecture can make its specific implementation by overriding arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres(). Signed-off-by: Xunlei Pang Cc: Eric Biederman Cc: Dave Young Cc: Minfei Huang Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++++++- kernel/kexec_core.c | 6 ++++++ kernel/kexec_file.c | 8 +++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ee70aef5cd81..b44cb3f5a15c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } + if (nr_segments > 0) { unsigned long i; @@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, image = xchg(dest_image, image); out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5d408252992..48b73cc8e425 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1563,3 +1563,9 @@ void __weak crash_map_reserved_pages(void) void __weak crash_unmap_reserved_pages(void) {} + +void __weak arch_kexec_protect_crashkres(void) +{} + +void __weak arch_kexec_unprotect_crashkres(void) +{} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c72d2ff5896e..503bc2d348e5 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) + if (flags & KEXEC_FILE_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: + if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); return ret; -- cgit v1.2.3 From 917a35605f09c0d16aeb2e92c7fbff562e19a116 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 23 May 2016 16:24:16 -0700 Subject: kexec: make a pair of map/unmap reserved pages in error path For some arch, kexec shall map the reserved pages, then use them, when we try to start the kdump service. kexec may return directly, without unmaping the reserved pages, if it fails during starting service. To fix it, we make a pair of map/unmap reserved pages both in generic path and error path. This patch only affects s390. Other architecturess don't implement the interface of crash_unmap_reserved_pages and crash_map_reserved_pages. It isn't a urgent patch. Kernel can work well without any risk, although the reserved pages are not unmapped before returning in error path. Signed-off-by: Minfei Huang Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Xunlei Pang Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index b44cb3f5a15c..4b49aa71304f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -194,22 +194,25 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, segments, flags); } if (result) - goto out; + goto unmap_page; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; result = machine_kexec_prepare(image); if (result) - goto out; + goto unmap_page; for (i = 0; i < nr_segments; i++) { result = kimage_load_segment(image, &image->segment[i]); if (result) - goto out; + goto unmap_page; } kimage_terminate(image); +unmap_page: if (flags & KEXEC_ON_CRASH) crash_unmap_reserved_pages(); + if (result) + goto out; } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); -- cgit v1.2.3 From 0eea08678ebe9f7d8ef98fed974a5bf1a0dd2dd2 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 23 May 2016 16:24:19 -0700 Subject: kexec: do a cleanup for function kexec_load There are a lof of work to be done in function kexec_load, not only for allocating structs and loading initram, but also for some misc. To make it more clear, wrap a new function do_kexec_load which is used to allocate structs and load initram. And the pre-work will be done in kexec_load. Signed-off-by: Minfei Huang Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Xunlei Pang Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 125 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 69 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b49aa71304f..b73dc211fcfd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -103,6 +103,74 @@ out_free_image: return ret; } +static int do_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, unsigned long flags) +{ + struct kimage **dest_image, *image; + unsigned long i; + int ret; + + if (flags & KEXEC_ON_CRASH) { + dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; + } + + if (nr_segments == 0) { + /* Uninstall image */ + kimage_free(xchg(dest_image, NULL)); + return 0; + } + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + } + + ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); + if (ret) + return ret; + + if (flags & KEXEC_ON_CRASH) + crash_map_reserved_pages(); + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < nr_segments; i++) { + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* Install the new kernel and uninstall the old */ + image = xchg(dest_image, image); + +out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + + /* + * Once the reserved memory is mapped, we should unmap this memory + * before returning + */ + if (flags & KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); + kimage_free(image); + return ret; +} + /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -127,7 +195,6 @@ out_free_image: SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - struct kimage **dest_image, *image; int result; /* We only trust the superuser with rebooting the system. */ @@ -152,9 +219,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; - image = NULL; - result = 0; - /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -166,63 +230,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (!mutex_trylock(&kexec_mutex)) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) { - dest_image = &kexec_crash_image; - if (kexec_crash_image) - arch_kexec_unprotect_crashkres(); - } - - if (nr_segments > 0) { - unsigned long i; - - if (flags & KEXEC_ON_CRASH) { - /* - * Loading another kernel to switch to if this one - * crashes. Free any current crash dump kernel before - * we corrupt it. - */ - - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - crash_map_reserved_pages(); - } else { - /* Loading another kernel to reboot into. */ - - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - } - if (result) - goto unmap_page; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto unmap_page; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto unmap_page; - } - kimage_terminate(image); -unmap_page: - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); - if (result) - goto out; - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + result = do_kexec_load(entry, nr_segments, segments, flags); -out: if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); mutex_unlock(&kexec_mutex); - kimage_free(image); return result; } -- cgit v1.2.3 From 7a0058ec78602da02b34fa2ae3afc523e90d1ab2 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 May 2016 16:24:22 -0700 Subject: s390/kexec: consolidate crash_map/unmap_reserved_pages() and arch_kexec_protect(unprotect)_crashkres() Commit 3f625002581b ("kexec: introduce a protection mechanism for the crashkernel reserved memory") is a similar mechanism for protecting the crash kernel reserved memory to previous crash_map/unmap_reserved_pages() implementation, the new one is more generic in name and cleaner in code (besides, some arch may not be allowed to unmap the pgtable). Therefore, this patch consolidates them, and uses the new arch_kexec_protect(unprotect)_crashkres() to replace former crash_map/unmap_reserved_pages() which by now has been only used by S390. The consolidation work needs the crash memory to be mapped initially, this is done in machine_kdump_pm_init() which is after reserve_crashkernel(). Once kdump kernel is loaded, the new arch_kexec_protect_crashkres() implemented for S390 will actually unmap the pgtable like before. Signed-off-by: Xunlei Pang Signed-off-by: Michael Holzheu Acked-by: Michael Holzheu Cc: Heiko Carstens Cc: "Eric W. Biederman" Cc: Minfei Huang Cc: Vivek Goyal Cc: Dave Young Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 12 ------------ kernel/kexec_core.c | 11 ++--------- 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index b73dc211fcfd..4384672d3245 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -136,9 +136,6 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) return ret; - if (flags & KEXEC_ON_CRASH) - crash_map_reserved_pages(); - if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -161,12 +158,6 @@ out: if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - /* - * Once the reserved memory is mapped, we should unmap this memory - * before returning - */ - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); kimage_free(image); return ret; } @@ -232,9 +223,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, result = do_kexec_load(entry, nr_segments, segments, flags); - if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) - arch_kexec_protect_crashkres(); - mutex_unlock(&kexec_mutex); return result; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 48b73cc8e425..56b3ed0927b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -954,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) @@ -968,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size) crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); @@ -1553,17 +1551,12 @@ int kernel_kexec(void) } /* - * Add and remove page tables for crashkernel memory + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. * * Provide an empty default implementation here -- architecture * code may override this */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - void __weak arch_kexec_protect_crashkres(void) {} -- cgit v1.2.3 From 7c051267931a9be9c6620cc17b362bc6ee6dedc8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:48 -0700 Subject: mm, fork: make dup_mmap wait for mmap_sem for write killable dup_mmap needs to lock current's mm mmap_sem for write. If the waiting task gets killed by the oom killer it would block oom_reaper from asynchronous address space reclaim and reduce the chances of timely OOM resolving. Wait for the lock in the killable mode and return with EINTR if the task got killed while waiting. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e67d7b773348..47887bba944f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -414,7 +414,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; uprobe_start_dup_mmap(); - down_write(&oldmm->mmap_sem); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -526,6 +529,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); +fail_uprobe_end: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: -- cgit v1.2.3 From 17b0573d77fbd547cbf1d711b90d269bdcc1efd3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:26:05 -0700 Subject: prctl: make PR_SET_THP_DISABLE wait for mmap_sem killable PR_SET_THP_DISABLE requires mmap_sem for write. If the waiting task gets killed by the oom killer it would block oom_reaper from asynchronous address space reclaim and reduce the chances of timely OOM resolving. Wait for the lock in the killable mode and return with EINTR if the task got killed while waiting. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index cf8ba545c7d3..89d5be418157 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2246,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - down_write(&me->mm->mmap_sem); + if (down_write_killable(&me->mm->mmap_sem)) + return -EINTR; if (arg2) me->mm->def_flags |= VM_NOHUGEPAGE; else -- cgit v1.2.3 From 598fdc1d66674264e122ca9d007ad822e98d8b8d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:26:08 -0700 Subject: uprobes: wait for mmap_sem for write killable xol_add_vma needs mmap_sem for write. If the waiting task gets killed by the oom killer it would block oom_reaper from asynchronous address space reclaim and reduce the chances of timely OOM resolving. Wait for the lock in the killable mode and return with EINTR if the task got killed while waiting. Do not warn in dup_xol_work if __create_xol_area failed due to fatal signal pending because this is usually considered a kernel issue. Signed-off-by: Michal Hocko Acked-by: Oleg Nesterov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c01f733ff2e1..b7a525ab2083 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1130,7 +1130,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; @@ -1469,7 +1471,8 @@ static void dup_xol_work(struct callback_head *work) if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->dup_xol_addr)) + if (!__create_xol_area(current->utask->dup_xol_addr) && + !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } -- cgit v1.2.3 From 59fa5860204ffc95128d60cba9f54f9740a42c7d Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 24 May 2016 11:42:30 +0100 Subject: genirq: Fix missing return value in irq_destroy_ipi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7cec18a3906b changed the return type of irq_destroy_ipi to int, but missed adding a value to one return statement. Fix this to silence the resulting compiler warning: kernel/irq/ipi.c In function ‘irq_destroy_ipi’: kernel/irq/ipi.c:128:3: warning: ‘return’ with no value, in function returning non-void [-Wreturn-type] Fixes: 7cec18a3906b "genirq: Add error code reporting to irq_{reserve,destroy}_ipi" Signed-off-by: Matt Redfearn Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/1464086550-24734-1-git-send-email-matt.redfearn@imgtec.com Signed-off-by: Thomas Gleixner --- kernel/irq/ipi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index c42742208e5e..89b49f6773f0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -125,7 +125,7 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) domain = data->domain; if (WARN_ON(domain == NULL)) - return; + return -EINVAL; if (!irq_domain_is_ipi(domain)) { pr_warn("Trying to destroy a non IPI domain!\n"); -- cgit v1.2.3 From b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2016 11:19:07 +0200 Subject: sched/core: Fix remote wakeups Commit: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") ... introduced a bug: Mike Galbraith found that it introduced a performance regression, while Paul E. McKenney reported lost wakeups and bisected it to this commit. The reason is that I mis-read ttwu_queue() such that I assumed any wakeup that got a remote queue must have had the task migrated. Since this is not so; we need to transfer this information between queueing the wakeup and actually doing the wakeup. Use a new task_struct::sched_flag for this, we already write to sched_contributes_to_load in the wakeup path so this is a hot and modified cacheline. Reported-by: Paul E. McKenney Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Ben Segall Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matt Fleming Cc: Morten Rasmussen Cc: Oleg Nesterov Cc: Paul Turner Cc: Pavan Kondeti Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: byungchul.park@lge.com Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") Link: http://lkml.kernel.org/r/20160523091907.GD15728@worktop.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 404c0784b1fc..7f2cae4620c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void) cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - /* - * See ttwu_queue(); we only call ttwu_queue_remote() when - * its a x-cpu wakeup. - */ - ttwu_do_activate(rq, p, WF_MIGRATED, cookie); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } lockdep_unpin_lock(&rq->lock, cookie); @@ -1819,10 +1821,12 @@ void scheduler_ipi(void) irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif -- cgit v1.2.3 From 887bddfa90c79957d61067cd54a10087be0c8b23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 May 2016 00:04:58 -0400 Subject: add down_write_killable_nested() Signed-off-by: Al Viro --- kernel/locking/rwsem.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c817216c1615..2e853ad93a3a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable_nested); + void up_read_non_owner(struct rw_semaphore *sem) { __up_read(sem); -- cgit v1.2.3 From 7ef949d77f95f0d129f0d404b336459a34a00101 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 26 May 2016 15:16:22 -0700 Subject: mm: oom_reaper: remove some bloat mmput_async is currently used only from the oom_reaper which is defined only for CONFIG_MMU. We can save work_struct in mm_struct for !CONFIG_MMU. [akpm@linux-foundation.org: fix typo, per Minchan] Link: http://lkml.kernel.org/r/20160520061658.GB19172@dhcp22.suse.cz Reported-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Minchan Kim Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 47887bba944f..5c2c355aa97f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); @@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +#endif /** * set_mm_exe_file - change a reference to the mm's executable file -- cgit v1.2.3 From 287980e49ffc0f6d911601e7e352a812ed27768e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 May 2016 23:23:25 +0200 Subject: remove lots of IS_ERR_VALUE abuses Most users of IS_ERR_VALUE() in the kernel are wrong, as they pass an 'int' into a function that takes an 'unsigned long' argument. This happens to work because the type is sign-extended on 64-bit architectures before it gets converted into an unsigned type. However, anything that passes an 'unsigned short' or 'unsigned int' argument into IS_ERR_VALUE() is guaranteed to be broken, as are 8-bit integers and types that are wider than 'unsigned long'. Andrzej Hajda has already fixed a lot of the worst abusers that were causing actual bugs, but it would be nice to prevent any users that are not passing 'unsigned long' arguments. This patch changes all users of IS_ERR_VALUE() that I could find on 32-bit ARM randconfig builds and x86 allmodconfig. For the moment, this doesn't change the definition of IS_ERR_VALUE() because there are probably still architecture specific users elsewhere. Almost all the warnings I got are for files that are better off using 'if (err)' or 'if (err < 0)'. The only legitimate user I could find that we get a warning for is the (32-bit only) freescale fman driver, so I did not remove the IS_ERR_VALUE() there but changed the type to 'unsigned long'. For 9pfs, I just worked around one user whose calling conventions are so obscure that I did not dare change the behavior. I was using this definition for testing: #define IS_ERR_VALUE(x) ((unsigned long*)NULL == (typeof (x)*)NULL && \ unlikely((unsigned long long)(x) >= (unsigned long long)(typeof(x))-MAX_ERRNO)) which ends up making all 16-bit or wider types work correctly with the most plausible interpretation of what IS_ERR_VALUE() was supposed to return according to its users, but also causes a compile-time warning for any users that do not pass an 'unsigned long' argument. I suggested this approach earlier this year, but back then we ended up deciding to just fix the users that are obviously broken. After the initial warning that caused me to get involved in the discussion (fs/gfs2/dir.c) showed up again in the mainline kernel, Linus asked me to send the whole thing again. [ Updated the 9p parts as per Al Viro - Linus ] Signed-off-by: Arnd Bergmann Cc: Andrzej Hajda Cc: Andrew Morton Link: https://lkml.org/lkml/2016/1/7/363 Link: https://lkml.org/lkml/2016/5/27/486 Acked-by: Srinivas Kandagatla # For nvmem part Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 4d73a834c7e6..f66162f2359b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (IS_ERR_VALUE(nr)) { + if (nr < 0) { retval = nr; goto out_free; } -- cgit v1.2.3 From c08376ac97cb202ec65320f3d90d5c4c5e2adb0b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 26 May 2016 17:21:05 -0700 Subject: timer: Export destroy_hrtimer_on_stack() hrtimer_init_on_stack() needs a matching call to destroy_hrtimer_on_stack(), so both need to be exported. Signed-off-by: Guenter Roeck Signed-off-by: David S. Miller --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8c7392c4fdbd..e99df0ff1d42 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } -- cgit v1.2.3 From 9bd616e3dbedfc103f158197c8ad93678849b1ed Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 1 Jun 2016 18:52:16 +0100 Subject: cpuidle: Do not access cpuidle_devices when !CONFIG_CPU_IDLE The cpuidle_devices per-CPU variable is only defined when CPU_IDLE is enabled. Commit c8cc7d4de7a4 ("sched/idle: Reorganize the idle loop") removed the #ifdef CONFIG_CPU_IDLE around cpuidle_idle_call() with the compiler optimising away __this_cpu_read(cpuidle_devices). However, with CONFIG_UBSAN && !CONFIG_CPU_IDLE, this optimisation no longer happens and the kernel fails to link since cpuidle_devices is not defined. This patch introduces an accessor function for the current CPU cpuidle device (returning NULL when !CONFIG_CPU_IDLE) and uses it in cpuidle_idle_call(). Signed-off-by: Catalin Marinas Cc: 4.5+ # 4.5+ Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..c5aeedf4e93a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void cpuidle_idle_call(void) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; -- cgit v1.2.3 From 0422e83d84ae24b933e4b0d4c1e0f0b4ae8a0a3b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 26 May 2016 21:08:17 +0100 Subject: locking/ww_mutex: Report recursive ww_mutex locking early Recursive locking for ww_mutexes was originally conceived as an exception. However, it is heavily used by the DRM atomic modesetting code. Currently, the recursive deadlock is checked after we have queued up for a busy-spin and as we never release the lock, we spin until kicked, whereupon the deadlock is discovered and reported. A simple solution for the now common problem is to move the recursive deadlock discovery to the first action when taking the ww_mutex. Suggested-by: Maarten Lankhorst Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maarten Lankhorst Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1464293297-19777-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e364b424b019..79d2d765a75f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES @@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; int ret; + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } + preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); -- cgit v1.2.3 From 5b6c1b4d46b0dae4edea636a776d09f2064f4cd7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 4 Jun 2016 20:50:59 +0200 Subject: bpf, trace: use READ_ONCE for retrieving file ptr In bpf_perf_event_read() and bpf_perf_event_output(), we must use READ_ONCE() for fetching the struct file pointer, which could get updated concurrently, so we must prevent the compiler from potential refetching. We already do this with tail calls for fetching the related bpf_prog, but not so on stored perf events. Semantics for both are the same with regards to updates. Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 780bcbe1d4de..720b7bb01d43 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -198,7 +198,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; + file = READ_ONCE(array->ptrs[index]); if (unlikely(!file)) return -ENOENT; @@ -247,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; + file = READ_ONCE(array->ptrs[index]); if (unlikely(!file)) return -ENOENT; -- cgit v1.2.3 From 2c610022711675ee908b903d242f0b90e1db661f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 10:19:51 +0200 Subject: locking/qspinlock: Fix spin_unlock_wait() some more While this prior commit: 54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()") ... fixes spin_is_locked() and spin_unlock_wait() for the usage in ipc/sem and netfilter, it does not in fact work right for the usage in task_work and futex. So while the 2 locks crossed problem: spin_lock(A) spin_lock(B) if (!spin_is_locked(B)) spin_unlock_wait(A) foo() foo(); ... works with the smp_mb() injected by both spin_is_locked() and spin_unlock_wait(), this is not sufficient for: flag = 1; smp_mb(); spin_lock() spin_unlock_wait() if (!flag) // add to lockless list // iterate lockless list ... because in this scenario, the store from spin_lock() can be delayed past the load of flag, uncrossing the variables and loosing the guarantee. This patch reworks spin_is_locked() and spin_unlock_wait() to work in both cases by exploiting the observation that while the lock byte store can be delayed, the contender must have registered itself visibly in other state contained in the word. It also allows for architectures to override both functions, as PPC and ARM64 have an additional issue for which we currently have no generic solution. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: stable@vger.kernel.org # v4.2 and later Fixes: 54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()") Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ce2f75e32ae1..5fc8c311b8fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,66 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before + * issuing an _unordered_ store to set _Q_LOCKED_VAL. + * + * This means that the store can be delayed, but no later than the + * store-release from the unlock. This means that simply observing + * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. + * + * There are two paths that can issue the unordered store: + * + * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 + * + * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 + * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 + * + * However, in both cases we have other !0 state we've set before to queue + * ourseves: + * + * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our + * load is constrained by that ACQUIRE to not pass before that, and thus must + * observe the store. + * + * For (2) we have a more intersting scenario. We enqueue ourselves using + * xchg_tail(), which ends up being a RELEASE. This in itself is not + * sufficient, however that is followed by an smp_cond_acquire() on the same + * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and + * guarantees we must observe that store. + * + * Therefore both cases have other !0 state that is observable before the + * unordered locked byte store comes through. This means we can use that to + * wait for the lock store, and then wait for an unlock. + */ +#ifndef queued_spin_unlock_wait +void queued_spin_unlock_wait(struct qspinlock *lock) +{ + u32 val; + + for (;;) { + val = atomic_read(&lock->val); + + if (!val) /* not locked, we're done */ + goto done; + + if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ + break; + + /* not locked, but pending, wait until we observe the lock */ + cpu_relax(); + } + + /* any unlock is good */ + while (atomic_read(&lock->val) & _Q_LOCKED_MASK) + cpu_relax(); + +done: + smp_rmb(); /* CTRL + RMB -> ACQUIRE */ +} +EXPORT_SYMBOL(queued_spin_unlock_wait); +#endif + #endif /* _GEN_PV_LOCK_SLOWPATH */ /** -- cgit v1.2.3 From 62a92c8f553e49270a0ee391b8733da71ab0aebc Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 7 Jun 2016 15:44:15 +0300 Subject: perf/core: Remove a redundant check There is no way to end up in _free_event() with event::pmu being NULL. The latter is initialized in event allocation path and remains set forever. In case of allocation failure, the error path doesn't use _free_event(). Having the check, however, suggests that it is possible to have a event::pmu==NULL situation in _free_event() and confuses the robots. This patch gets rid of the check. Reported-by: Dan Carpenter Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/1465303455-26032-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 050a290c72c7..87e945d6ebb8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3862,10 +3862,8 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } + exclusive_event_destroy(event); + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } -- cgit v1.2.3 From 9c57259117b9c25472a3fa6d5a14d6bb3b647e87 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Jun 2016 17:58:40 -0500 Subject: sched/debug: Fix /proc/sched_debug regression Commit: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") ... introduced a bug when CONFIG_SCHEDSTATS is enabled and the runtime tunable is disabled (which is the default). The wait-time, sum-exec, and sum-sleep fields are missing from the /proc/sched_debug file in the runnable_tasks section. Fix it with a new schedstat_val() macro which returns the field value when schedstats is enabled and zero otherwise. The macro works with both SCHEDSTATS and !SCHEDSTATS. I put the macro in stats.h since it might end up being useful in other places. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/bcda7c2790cf2ccbe586a28c02dd7b6fe7749a2b.1464994423.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 15 ++++----------- kernel/sched/stats.h | 3 +++ 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cf905f655ba1..0368c393a336 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) { - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); - } -#else + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - 0LL, 0L, + SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - 0LL, 0L); -#endif + SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 70b3b6a20fb0..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) + #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) +# define schedstat_val(rq, field) 0 #endif #ifdef CONFIG_SCHED_INFO -- cgit v1.2.3 From 4698f88c06b893f2acc0b443004a53bf490fde7c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 7 Jun 2016 14:43:16 -0500 Subject: sched/debug: Fix 'schedstats=enable' cmdline option The 'schedstats=enable' option doesn't work, and also produces the following warning during boot: WARNING: CPU: 0 PID: 0 at /home/jpoimboe/git/linux/kernel/jump_label.c:61 static_key_slow_inc+0x8c/0xa0 static_key_slow_inc used before call to jump_label_init Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.7.0-rc1+ #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 0000000000000086 3ae3475a4bea95d4 ffffffff81e03da8 ffffffff8143fc83 ffffffff81e03df8 0000000000000000 ffffffff81e03de8 ffffffff810b1ffb 0000003d00000096 ffffffff823514d0 ffff88007ff197c8 0000000000000000 Call Trace: [] dump_stack+0x85/0xc2 [] __warn+0xcb/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] static_key_slow_inc+0x8c/0xa0 [] static_key_enable+0x16/0x40 [] setup_schedstats+0x29/0x94 [] unknown_bootoption+0x89/0x191 [] parse_args+0x297/0x4b0 [] start_kernel+0x1d8/0x4a9 [] ? set_init_arg+0x55/0x55 [] ? early_idt_handler_array+0x120/0x120 [] x86_64_start_reservations+0x2f/0x31 [] x86_64_start_kernel+0x14a/0x16d The problem is that it tries to update the 'sched_schedstats' static key before jump labels have been initialized. Changing jump_label_init() to be called earlier before parse_early_param() wouldn't fix it: it would still fail trying to poke_text() because mm isn't yet initialized. Instead, just create a temporary '__sched_schedstats' variable which can be copied to the static key later during sched_init() after jump labels have been initialized. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/453775fe3433bed65731a583e228ccea806d18cd.1465322027.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f2cae4620c7..385c947482e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2253,9 +2253,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +#ifdef CONFIG_SCHEDSTATS + DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS static void set_schedstats(bool enabled) { if (enabled) @@ -2278,11 +2280,16 @@ static int __init setup_schedstats(char *str) if (!str) goto out; + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ if (!strcmp(str, "enable")) { - set_schedstats(true); + __sched_schedstats = true; ret = 1; } else if (!strcmp(str, "disable")) { - set_schedstats(false); + __sched_schedstats = false; ret = 1; } out: @@ -2293,6 +2300,11 @@ out: } __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2313,8 +2325,10 @@ int sysctl_schedstats(struct ctl_table *table, int write, set_schedstats(state); return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ /* * fork()/clone()-time setup: @@ -7487,6 +7501,8 @@ void __init sched_init(void) #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } -- cgit v1.2.3 From 077fa7aed17de5022e44bf07dbaf732078b7b5b2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 8 Jun 2016 14:25:22 +0100 Subject: futex: Calculate the futex key based on a tail page for file-based futexes Mike Galbraith reported that the LTP test case futex_wake04 was broken by commit 65d8fc777f6d ("futex: Remove requirement for lock_page() in get_futex_key()"). This test case uses futexes backed by hugetlbfs pages and so there is an associated inode with a futex stored on such pages. The problem is that the key is being calculated based on the head page index of the hugetlbfs page and not the tail page. Prior to the optimisation, the page lock was used to stabilise mappings and pin the inode is file-backed which is overkill. If the page was a compound page, the head page was automatically looked up as part of the page lock operation but the tail page index was used to calculate the futex key. After the optimisation, the compound head is looked up early and the page lock is only relied upon to identify truncated pages, special pages or a shmem page moving to swapcache. The head page is looked up because without the page lock, special care has to be taken to pin the inode correctly. However, the tail page is still required to calculate the futex key so this patch records the tail page. On vanilla 4.6, the output of the test case is; futex_wake04 0 TINFO : Hugepagesize 2097152 futex_wake04 1 TFAIL : futex_wake04.c:126: Bug: wait_thread2 did not wake after 30 secs. With the patch applied futex_wake04 0 TINFO : Hugepagesize 2097152 futex_wake04 1 TPASS : Hi hydra, thread2 awake! Fixes: 65d8fc777f6d "futex: Remove requirement for lock_page() in get_futex_key()" Reported-and-tested-by: Mike Galbraith Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Cc: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160608132522.GM2469@suse.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ee25f5ba4aca..33664f70e2d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *tail; struct address_space *mapping; int err, ro = 0; @@ -530,7 +530,15 @@ again: * considered here and page lock forces unnecessarily serialization * From this point on, mapping will be re-verified if necessary and * page lock will be acquired only if it is unavoidable - */ + * + * Mapping checks require the head page for any compound page so the + * head page and mapping is looked up now. For anonymous pages, it + * does not matter if the page splits in the future as the key is + * based on the address. For filesystem-backed pages, the tail is + * required as the index of the page determines the key. For + * base pages, there is no tail page and tail == page. + */ + tail = page; page = compound_head(page); mapping = READ_ONCE(page->mapping); @@ -654,7 +662,7 @@ again: key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = inode; - key->shared.pgoff = basepage_index(page); + key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } -- cgit v1.2.3 From ba62bafe942b159a6109cbec780d36496e06b6c5 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 8 Jun 2016 15:33:53 -0700 Subject: kernel/relay.c: fix potential memory leak When relay_open_buf() fails in relay_open(), code will goto free_bufs, but chan is nowhere freed. Link: http://lkml.kernel.org/r/1464777927-19675-1-git-send-email-yizhouzhou@ict.ac.cn Signed-off-by: Zhouyi Zhou Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..04d7cf3ef8cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -614,6 +614,7 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); + kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); -- cgit v1.2.3 From 29d6455178a09e1dc340380c582b13356227e8df Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: sched: panic on corrupted stack end Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..11546a6ed5df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3047,7 +3047,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { -- cgit v1.2.3 From b7fa30c9cc48c4f55663420472505d3b4f6e1705 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jun 2016 15:07:50 +0200 Subject: sched/fair: Fix post_init_entity_util_avg() serialization Chris Wilson reported a divide by 0 at: post_init_entity_util_avg(): > 725 if (cfs_rq->avg.util_avg != 0) { > 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; > -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1); > 728 > 729 if (sa->util_avg > cap) > 730 sa->util_avg = cap; > 731 } else { Which given the lack of serialization, and the code generated from update_cfs_rq_load_avg() is entirely possible: if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed_load = 1; } turns into: ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax ffffffff8108706b: 48 85 c0 test %rax,%rax ffffffff8108706e: 74 40 je ffffffff810870b0 ffffffff81087070: 4c 89 f8 mov %r15,%rax ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) ffffffff8108707e: 4c 89 f9 mov %r15,%rcx ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx Which you'll note ends up with 'sa->load_avg - r' in memory at ffffffff8108707a. By calling post_init_entity_util_avg() under rq->lock we're sure to be fully serialized against PELT updates and cannot observe intermediate state like this. Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value") Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..13d0896aff87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4e33ad12bb68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; -- cgit v1.2.3 From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Jun 2016 02:32:09 -0500 Subject: sched/debug: Fix deadlock when enabling sched events I see a hang when enabling sched events: echo 1 > /sys/kernel/debug/tracing/events/sched/enable The printk buffer shows: BUG: spinlock recursion on CPU#1, swapper/1/0 lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 ... Call Trace: [] dump_stack+0x85/0xc2 [] spin_dump+0x78/0xc0 [] do_raw_spin_lock+0x11a/0x150 [] _raw_spin_lock+0x61/0x80 [] ? try_to_wake_up+0x256/0x4e0 [] try_to_wake_up+0x256/0x4e0 [] ? _raw_spin_unlock_irqrestore+0x4a/0x80 [] wake_up_process+0x15/0x20 [] insert_work+0x84/0xc0 [] __queue_work+0x18f/0x660 [] queue_work_on+0x46/0x90 [] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper] [] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper] [] soft_cursor+0x1ad/0x230 [] bit_cursor+0x649/0x680 [] ? update_attr.isra.2+0x90/0x90 [] fbcon_cursor+0x14a/0x1c0 [] hide_cursor+0x28/0x90 [] vt_console_print+0x3bf/0x3f0 [] call_console_drivers.constprop.24+0x183/0x200 [] console_unlock+0x3d4/0x610 [] vprintk_emit+0x3c5/0x610 [] vprintk_default+0x29/0x40 [] printk+0x57/0x73 [] enqueue_entity+0xc2e/0xc70 [] enqueue_task_fair+0x59/0xab0 [] ? kvm_sched_clock_read+0x9/0x20 [] ? sched_clock+0x9/0x10 [] activate_task+0x5c/0xa0 [] ttwu_do_activate+0x54/0xb0 [] sched_ttwu_pending+0x7a/0xb0 [] scheduler_ipi+0x61/0x170 [] smp_trace_reschedule_interrupt+0x4f/0x2a0 [] trace_reschedule_interrupt+0x96/0xa0 [] ? native_safe_halt+0x6/0x10 [] ? trace_hardirqs_on+0xd/0x10 [] default_idle+0x20/0x1a0 [] arch_cpu_idle+0xf/0x20 [] default_idle_call+0x2f/0x50 [] cpu_startup_entry+0x37e/0x450 [] start_secondary+0x160/0x1a0 Note the hang only occurs when echoing the above from a physical serial console, not from an ssh session. The bug is caused by a deadlock where the task is trying to grab the rq lock twice because printk()'s aren't safe in sched code. Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e33ad12bb68..a2348deab7a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3246,7 +3246,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_iowait_enabled() || trace_sched_stat_blocked_enabled() || trace_sched_stat_runtime_enabled()) { - pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " "kernel parameter schedstats=enabled or " "kernel.sched_schedstats=1\n"); -- cgit v1.2.3 From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 9 Jun 2016 15:20:05 +0300 Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w Lengthy output of sysrq-w may take a lot of time on slow serial console. Currently we reset NMI-watchdog on the current CPU to avoid spurious lockup messages. Sometimes this doesn't work since softlockup watchdog might trigger on another CPU which is waiting for an IPI to proceed. We reset softlockup watchdogs on all CPUs, but we do this only after listing all tasks, and this may be too late on a busy system. So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..4135ac83cb65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5147,14 +5147,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -- cgit v1.2.3 From df4565f9ebdc4d6dc50edc6e8fed08004e328332 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 24 May 2016 14:05:05 +0200 Subject: kernel/kcov: unproxify debugfs file's fops Since commit 49d200deaa68 ("debugfs: prevent access to removed files' private data"), a debugfs file's file_operations methods get proxied through lifetime aware wrappers. However, only a certain subset of the file_operations members is supported by debugfs and ->mmap isn't among them -- it appears to be NULL from the VFS layer's perspective. This behaviour breaks the /sys/kernel/debug/kcov file introduced concurrently with commit 5c9a8750a640 ("kernel: add kcov code coverage"). Since that file never gets removed, there is no file removal race and thus, a lifetime checking proxy isn't needed. Avoid the proxying for /sys/kernel/debug/kcov by creating it via debugfs_create_file_unsafe() rather than debugfs_create_file(). Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data") Fixes: 5c9a8750a640 ("kernel: add kcov code coverage") Reported-by: Sasha Levin Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- kernel/kcov.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = { static int __init kcov_init(void) { - if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + /* + * The kcov debugfs file won't ever get removed and thus, + * there is no need to protect it against removal races. The + * use of debugfs_create_file_unsafe() is actually safe here. + */ + if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { pr_err("failed to create kcov in debugfs\n"); return -ENOMEM; } -- cgit v1.2.3 From 8974189222159154c55f24ddad33e3613960521a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 10:50:40 +0200 Subject: sched/fair: Fix cfs_rq avg tracking underflow As per commit: b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization") > the code generated from update_cfs_rq_load_avg(): > > if (atomic_long_read(&cfs_rq->removed_load_avg)) { > s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > sa->load_avg = max_t(long, sa->load_avg - r, 0); > sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); > removed_load = 1; > } > > turns into: > > ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax > ffffffff8108706b: 48 85 c0 test %rax,%rax > ffffffff8108706e: 74 40 je ffffffff810870b0 > ffffffff81087070: 4c 89 f8 mov %r15,%rax > ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) > ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) > ffffffff8108707e: 4c 89 f9 mov %r15,%rcx > ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx > ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) > ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx > > Which you'll note ends up with sa->load_avg -= r in memory at > ffffffff8108707a. So I _should_ have looked at other unserialized users of ->load_avg, but alas. Luckily nikbor reported a similar /0 from task_h_load() which instantly triggered recollection of this here problem. Aside from the intermediate value hitting memory and causing problems, there's another problem: the underflow detection relies on the signed bit. This reduces the effective width of the variables, IOW its effectively the same as having these variables be of signed type. This patch changes to a different means of unsigned underflow detection to not rely on the signed bit. This allows the variables to use the 'full' unsigned range. And it does so with explicit LOAD - STORE to ensure any intermediate value will never be visible in memory, allowing these unserialized loads. Note: GCC generates crap code for this, might warrant a look later. Note2: I say 'full' above, if we end up at U*_MAX we'll still explode; maybe we should do clamping on add too. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: kernel@kyup.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2348deab7a3..2ae68f0e3bf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) } } +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; } @@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); cfs_rq_util_change(cfs_rq); } -- cgit v1.2.3 From 70c8217acd4383e069fe1898bbad36ea4fcdbdcc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 17 Jun 2016 16:10:42 -0400 Subject: tracing: Handle NULL formats in hold_module_trace_bprintk_format() If a task uses a non constant string for the format parameter in trace_printk(), then the trace_printk_fmt variable is set to NULL. This variable is then saved in the __trace_printk_fmt section. The function hold_module_trace_bprintk_format() checks to see if duplicate formats are used by modules, and reuses them if so (saves them to the list if it is new). But this function calls lookup_format() that does a strcmp() to the value (which is now NULL) and can cause a kernel oops. This wasn't an issue till 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()") which added "__used" to the trace_printk_fmt variable, and before that, the kernel simply optimized it out (no NULL value was saved). The fix is simply to handle the NULL pointer in lookup_format() and have the caller ignore the value if it was NULL. Link: http://lkml.kernel.org/r/1464769870-18344-1-git-send-email-zhengjun.xing@intel.com Reported-by: xingzhen Acked-by: Namhyung Kim Fixes: 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()") Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } -- cgit v1.2.3 From 6720a305df74ca30bcc10fc316881641b6ff0c80 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jun 2016 12:11:17 -0700 Subject: locking: avoid passing around 'thread_info' in mutex debugging code None of the code actually wants a thread_info, it all wants a task_struct, and it's just converting back and forth between the two ("ti->task" to get the task_struct from the thread_info, and "task_thread_info(task)" to go the other way). No semantic change. Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/locking/mutex-debug.c | 12 ++++++------ kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.c | 6 +++--- kernel/locking/mutex.h | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 79d2d765a75f..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -- cgit v1.2.3 From 4c5ea0a9cd02d6aa8adc86e100b2a4cff8d614ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Jun 2016 18:52:17 +0200 Subject: locking/static_key: Fix concurrent static_key_slow_inc() The following scenario is possible: CPU 1 CPU 2 static_key_slow_inc() atomic_inc_not_zero() -> key.enabled == 0, no increment jump_label_lock() atomic_inc_return() -> key.enabled == 1 now static_key_slow_inc() atomic_inc_not_zero() -> key.enabled == 1, inc to 2 return ** static key is wrong! jump_label_update() jump_label_unlock() Testing the static key at the point marked by (**) will follow the wrong path for jumps that have not been patched yet. This can actually happen when creating many KVM virtual machines with userspace LAPIC emulation; just run several copies of the following program: #include #include #include #include int main(void) { for (;;) { int kvmfd = open("/dev/kvm", O_RDONLY); int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); close(ioctl(vmfd, KVM_CREATE_VCPU, 1)); close(vmfd); close(kvmfd); } return 0; } Every KVM_CREATE_VCPU ioctl will attempt a static_key_slow_inc() call. The static key's purpose is to skip NULL pointer checks and indeed one of the processes eventually dereferences NULL. As explained in the commit that introduced the bug: 706249c222f6 ("locking/static_keys: Rework update logic") jump_label_update() needs key.enabled to be true. The solution adopted here is to temporarily make key.enabled == -1, and use go down the slow path when key.enabled <= 0. Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Cc: # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 706249c222f6 ("locking/static_keys: Rework update logic") Link: http://lkml.kernel.org/r/1466527937-69798-1-git-send-email-pbonzini@redhat.com [ Small stylistic edits to the changelog and the code. ] Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); -- cgit v1.2.3 From 094f469172e00d6ab0a3130b0e01c83b3cf3a98d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:01 +0300 Subject: sched/fair: Initialize throttle_count for new task-groups lazily Cgroup created inside throttled group must inherit current throttle_count. Broken throttle_count allows to nominate throttled entries as a next buddy, later this leads to null pointer dereference in pick_next_task_fair(). This patch initialize cfs_rq->throttle_count at first enqueue: laziness allows to skip locking all rq at group creation. Lazy approach also allows to skip full sub-tree scan at throttling hierarchy (not in this patch). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Link: http://lkml.kernel.org/r/146608182119.21870.8439834428248129633.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++++++++++++ kernel/sched/sched.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ae68f0e3bf5..8c5d8c0c8827 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4202,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..7cbeb92a1cb9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -437,7 +437,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 754bd598be9bbc953bc709a9e8ed7f3188bfb9d7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:15 +0300 Subject: sched/fair: Do not announce throttled next buddy in dequeue_task_fair() Hierarchy could be already throttled at this point. Throttled next buddy could trigger a NULL pointer dereference in pick_next_task_fair(). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/146608183552.21905.15924473394414832071.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c5d8c0c8827..bdcbeea90c95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4537,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; -- cgit v1.2.3 From feb245e304f343cf5e4f9123db36354144dce8a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Jun 2016 15:35:04 -0400 Subject: sched/core: Allow kthreads to fall back to online && !active cpus During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is online but not active. A CPU_ONLINE callback may create or bind a kthread so that its cpus_allowed mask only allows the CPU which is being brought online. The kthread may start executing before the CPU is made active and can end up in select_fallback_rq(). In such cases, the expected behavior is selecting the CPU which is coming online; however, because select_fallback_rq() only chooses from active CPUs, it determines that the task doesn't have any viable CPU in its allowed mask and ends up overriding it to cpu_possible_mask. CPU_ONLINE callbacks should be able to put kthreads on the CPU which is coming online. Update select_fallback_rq() so that it follows cpu_online() rather than cpu_active() for kthreads. Reported-by: Gautham R Shenoy Tested-by: Gautham R. Shenoy Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Abdul Haleem Cc: Aneesh Kumar Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4135ac83cb65..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_active(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) + continue; + if (!cpu_online(dest_cpu)) continue; goto out; } -- cgit v1.2.3 From b235beea9e996a4d36fed6cfef4801a3e7d7a9a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 15:09:37 -0700 Subject: Clarify naming of thread info/stack allocators We've had the thread info allocated together with the thread stack for most architectures for a long time (since the thread_info was split off from the task struct), but that is about to change. But the patches that move the thread info to be off-stack (and a part of the task struct instead) made it clear how confused the allocator and freeing functions are. Because the common case was that we share an allocation with the thread stack and the thread_info, the two pointers were identical. That identity then meant that we would have things like ti = alloc_thread_info_node(tsk, node); ... tsk->stack = ti; which certainly _worked_ (since stack and thread_info have the same value), but is rather confusing: why are we assigning a thread_info to the stack? And if we move the thread_info away, the "confusing" code just gets to be entirely bogus. So remove all this confusion, and make it clear that we are doing the stack allocation by renaming and clarifying the function names to be about the stack. The fact that the thread_info then shares the allocation is an implementation detail, and not really about the allocation itself. This is a pure renaming and type fix: we pass in the same pointer, it's just that we clarify what the pointer means. The ia64 code that actually only has one single allocation (for all of task_struct, thread_info and kernel thread stack) now looks a bit odd, but since "tsk->stack" is actually not even used there, that oddity doesn't matter. It would be a separate thing to clean that up, I intentionally left the ia64 changes as a pure brute-force renaming and type change. Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- kernel/fork.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..37b9439b8c07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(ti); + struct page *page = virt_to_page(stack); memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; -- cgit v1.2.3 From 74070542099c66d87aebeacd7b54dc0e8b6a73f9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 24 Jun 2016 14:50:16 -0700 Subject: oom, suspend: fix oom_reaper vs. oom_killer_disable race Tetsuo has reported the following potential oom_killer_disable vs. oom_reaper race: (1) freeze_processes() starts freezing user space threads. (2) Somebody (maybe a kenrel thread) calls out_of_memory(). (3) The OOM killer calls mark_oom_victim() on a user space thread P1 which is already in __refrigerator(). (4) oom_killer_disable() sets oom_killer_disabled = true. (5) P1 leaves __refrigerator() and enters do_exit(). (6) The OOM reaper calls exit_oom_victim(P1) before P1 can call exit_oom_victim(P1). (7) oom_killer_disable() returns while P1 not yet finished (8) P1 perform IO/interfere with the freezer. This situation is unfortunate. We cannot move oom_killer_disable after all the freezable kernel threads are frozen because the oom victim might depend on some of those kthreads to make a forward progress to exit so we could deadlock. It is also far from trivial to teach the oom_reaper to not call exit_oom_victim() because then we would lose a guarantee of the OOM killer and oom_killer_disable forward progress because exit_mm->mmput might block and never call exit_oom_victim. It seems the easiest way forward is to workaround this race by calling try_to_freeze_tasks again after oom_killer_disable. This will make sure that all the tasks are frozen or it bails out. Fixes: 449d777d7ad6 ("mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper") Link: http://lkml.kernel.org/r/1466597634-16199-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,6 +146,18 @@ int freeze_processes(void) if (!error && !oom_killer_disable()) error = -EBUSY; + /* + * There is a hard to fix race between oom_reaper kernel thread + * and oom_killer_disable. oom_reaper calls exit_oom_victim + * before the victim reaches exit_mm so try to freeze all the tasks + * again and catch such a left over task. + */ + if (!error) { + pr_info("Double checking all user space processes after OOM killer disable... "); + error = try_to_freeze_tasks(true); + pr_cont("\n"); + } + if (error) thaw_processes(); return error; -- cgit v1.2.3 From 9521d39976db20f8ef9b56af66661482a17d5364 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 25 Jun 2016 21:53:30 +1000 Subject: Fix build break in fork.c when THREAD_SIZE < PAGE_SIZE Commit b235beea9e99 ("Clarify naming of thread info/stack allocators") breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE: kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack' kernel/fork.c:355:8: error: assignment from incompatible pointer type stack = alloc_thread_stack_node(tsk, node); ^ Fix it by renaming free_stack() to free_thread_stack(), and updating the return type of alloc_thread_stack_node(). Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators") Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 37b9439b8c07..4a7ec0c6c88c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -183,13 +183,13 @@ static inline void free_thread_stack(unsigned long *stack) # else static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_stack(unsigned long *stack) +static void free_thread_stack(unsigned long *stack) { kmem_cache_free(thread_stack_cache, stack); } -- cgit v1.2.3