From 63662139e519ce06090b2759cf4a1d291b9cc0e2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 14 Mar 2013 13:23:11 +0000 Subject: params: Fix potential memory leak in add_sysfs_param() On allocation failure, it would fail to free the old attrs array which was no longer referenced by anything (since it would free the old module_param_attrs struct on the way out). Comment the suspicious-looking krealloc() usage to explain why it *isn't* actually buggy, despite looking like a classic realloc() usage bug. Signed-off-by: David Woodhouse --- kernel/params.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index ed35345be536..53b958fcd639 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), GFP_KERNEL); if (!new) { - kfree(mk->mp); + kfree(attrs); err = -ENOMEM; goto fail; } + /* Despite looking like the typical realloc() bug, this is safe. + * We *want* the old 'attrs' to be freed either way, and we'll store + * the new one in the success case. */ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); if (!attrs) { err = -ENOMEM; -- cgit v1.2.3 From 37eebe39c9731a76535f08de455db97eb93894ae Mon Sep 17 00:00:00 2001 From: Matvejchikov Ilya Date: Tue, 13 Dec 2011 23:09:08 +0300 Subject: audit: improve GID/EGID comparation logic It is useful to extend GID/EGID comparison logic to be able to match not only the exact GID/EGID values but the group/egroup also. 
Signed-off-by: Matvejchikov Ilya Signed-off-by: Eric Paris --- kernel/auditsc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a9..77c705c302f7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -633,9 +633,23 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_GID: result = audit_gid_comparator(cred->gid, f->op, f->gid); + if (f->op == Audit_equal) { + if (!result) + result = in_group_p(f->gid); + } else if (f->op == Audit_not_equal) { + if (result) + result = !in_group_p(f->gid); + } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); + if (f->op == Audit_equal) { + if (!result) + result = in_egroup_p(f->gid); + } else if (f->op == Audit_not_equal) { + if (result) + result = !in_egroup_p(f->gid); + } break; case AUDIT_SGID: result = audit_gid_comparator(cred->sgid, f->op, f->gid); -- cgit v1.2.3 From b551d1d98197b7dd58fc3ead8d4d01830c09567d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 24 Jan 2013 13:15:10 -0500 Subject: audit: refactor hold queue flush The hold queue flush code is an autonomous chunk of code that can be refactored, removed from kauditd_thread() into flush_hold_queue() and flattenned for better legibility. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 62 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f15..4bf486c3e9e8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -417,34 +417,52 @@ static void kauditd_send_skb(struct sk_buff *skb) consume_skb(skb); } +/* + * flush_hold_queue - empty the hold queue if auditd appears + * + * If auditd just started, drain the queue of messages already + * sent to syslog/printk. Remember loss here is ok. We already + * called audit_log_lost() if it didn't go out normally. 
so the + * race between the skb_dequeue and the next check for audit_pid + * doesn't matter. + * + * If you ever find kauditd to be too slow we can get a perf win + * by doing our own locking and keeping better track if there + * are messages in this queue. I don't see the need now, but + * in 5 years when I want to play with this again I'll see this + * note and still have no friggin idea what i'm thinking today. + */ +static void flush_hold_queue(void) +{ + struct sk_buff *skb; + + if (!audit_default || !audit_pid) + return; + + skb = skb_dequeue(&audit_skb_hold_queue); + if (likely(!skb)) + return; + + while (skb && audit_pid) { + kauditd_send_skb(skb); + skb = skb_dequeue(&audit_skb_hold_queue); + } + + /* + * if auditd just disappeared but we + * dequeued an skb we need to drop ref + */ + if (skb) + consume_skb(skb); +} + static int kauditd_thread(void *dummy) { struct sk_buff *skb; set_freezable(); while (!kthread_should_stop()) { - /* - * if auditd just started drain the queue of messages already - * sent to syslog/printk. remember loss here is ok. we already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. - * - * if you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. 
- */ - if (audit_default && audit_pid) { - skb = skb_dequeue(&audit_skb_hold_queue); - if (unlikely(skb)) { - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } - } - } + flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); -- cgit v1.2.3 From 3320c5133dd83df58b8fbc529b5419e02ca16fe6 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 24 Jan 2013 13:15:11 -0500 Subject: audit: flatten kauditd_thread wait queue code The wait queue control code in kauditd_thread() was nested deeper than necessary. The function has been flattened for better legibility. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4bf486c3e9e8..1531efbd11e2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -458,10 +458,11 @@ static void flush_hold_queue(void) static int kauditd_thread(void *dummy) { - struct sk_buff *skb; - set_freezable(); while (!kthread_should_stop()) { + struct sk_buff *skb; + DECLARE_WAITQUEUE(wait, current); + flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); @@ -471,19 +472,18 @@ static int kauditd_thread(void *dummy) kauditd_send_skb(skb); else audit_printk_skb(skb); - } else { - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kauditd_wait, &wait); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + if (!skb_queue_len(&audit_skb_queue)) { + try_to_freeze(); + schedule(); } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kauditd_wait, &wait); } return 0; } -- cgit v1.2.3 From 6ff5e45985c2fcb97947818f66d1eeaf9d6600b2 Mon Sep 17 
00:00:00 2001 From: Richard Guy Briggs Date: Thu, 24 Jan 2013 13:15:12 -0500 Subject: audit: move kaudit thread start from auditd registration to kaudit init The kauditd_thread() task was started only after the auditd userspace daemon registers itself with kaudit. This was fine when only auditd consumed messages from the kaudit netlink unicast socket. With the addition of a multicast group to that socket it is more convenient to have the thread start on init of the kaudit kernel subsystem. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 1531efbd11e2..02a5d9eefa82 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -676,16 +676,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; - } - loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); @@ -974,6 +964,10 @@ static int __init audit_init(void) else audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) + return PTR_ERR(kauditd_task); + skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; -- cgit v1.2.3 From 65ada7bc02e2dcea6dea1f11876e712d5ea7e9ba Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 1 Apr 2013 11:00:00 +0400 Subject: audit: destroy long filenames correctly filename should be destroyed via final_putname() instead of __putname() Otherwise this result in following BUGON() in case of long names: kernel 
BUG at mm/slab.c:3006! Call Trace: kmem_cache_free+0x1c1/0x850 audit_putname+0x88/0x90 putname+0x73/0x80 sys_symlinkat+0x120/0x150 sys_symlink+0x16/0x20 system_call_fastpath+0x16/0x1b Introduced-in: 7950e3852 Signed-off-by: Dmitry Monakhov Reviewed-by: Jeff Layton Signed-off-by: Eric Paris --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 77c705c302f7..b59ffb293ded 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1024,7 +1024,7 @@ static inline void audit_free_names(struct audit_context *context) list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); if (n->name && n->name_put) - __putname(n->name); + final_putname(n->name); if (n->should_free) kfree(n); } @@ -2050,7 +2050,7 @@ void audit_putname(struct filename *name) BUG_ON(!context); if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", + printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; @@ -2061,7 +2061,7 @@ void audit_putname(struct filename *name) n->name, n->name->name ?: "(null)"); } #endif - __putname(name); + final_putname(name); } #if AUDIT_DEBUG else { -- cgit v1.2.3 From 2950fa9d3291b90e9b7663b6a409ea37a97a5e35 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sun, 7 Apr 2013 16:55:23 +0800 Subject: kernel: audit: beautify code, for extern function, better to check its parameters by itself __audit_socketcall is an extern function. better to check its parameters by itself. also can return error code, when fail (find invalid parameters). also use macro instead of real hard code number also give related comments for it. 
Signed-off-by: Chen Gang [eparis: fix the return value when !CONFIG_AUDIT] Signed-off-by: Eric Paris --- kernel/auditsc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b59ffb293ded..d57ad32db367 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -226,7 +226,7 @@ struct audit_context { union { struct { int nargs; - long args[6]; + long args[AUDITSC_ARGS]; } socketcall; struct { kuid_t uid; @@ -2491,17 +2491,20 @@ int __audit_bprm(struct linux_binprm *bprm) /** * audit_socketcall - record audit data for sys_socketcall - * @nargs: number of args + * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * */ -void __audit_socketcall(int nargs, unsigned long *args) +int __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; + if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) + return -EINVAL; context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); + return 0; } /** -- cgit v1.2.3 From 17c6ee707a32c8e67861a442f387def5b7f64cec Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sun, 7 Apr 2013 16:14:18 +0600 Subject: auditsc: Use kzalloc instead of kmalloc+memset. In function audit_alloc_context(), use kzalloc, instead of kmalloc+memset. Patch also renames audit_zero_context() to audit_set_context(), to represent it's inner workings properly. 
Signed-off-by: Rakib Mullick Signed-off-by: Eric Paris --- kernel/auditsc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d57ad32db367..9dc3bae9793d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1048,10 +1048,9 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_zero_context(struct audit_context *context, +static inline void audit_set_context(struct audit_context *context, enum audit_state state) { - memset(context, 0, sizeof(*context)); context->state = state; context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; } @@ -1060,9 +1059,10 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return NULL; - audit_zero_context(context, state); + audit_set_context(context, state); INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; -- cgit v1.2.3 From e2c5adc88a0ffd4a715f630c3b83a1d5cbfd1cff Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Apr 2013 14:43:41 -0700 Subject: auditsc: remove audit_set_context() altogether - fold it into its caller > In function audit_alloc_context(), use kzalloc, instead of kmalloc+memset. Patch also renames audit_zero_context() to > audit_set_context(), to represent it's inner workings properly. Fair enough. I'd go further... 
Cc: Al Viro Cc: Eric Paris Cc: Rakib Mullick Signed-off-by: Andrew Morton Signed-off-by: Eric Paris --- kernel/auditsc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9dc3bae9793d..b536d22fe56f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1048,13 +1048,6 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_set_context(struct audit_context *context, - enum audit_state state) -{ - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} - static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; @@ -1062,7 +1055,8 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return NULL; - audit_set_context(context, state); + context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; -- cgit v1.2.3 From f7616102d6f62d51cffb796d4672ad81fef00fea Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Apr 2013 11:25:00 -0400 Subject: audit: use data= not msg= for AUDIT_USER_TTY messages Userspace parsing libraries assume that msg= is only for userspace audit records, not for user tty records. Make this consistent with the other tty records. 
Reported-by: Steve Grubb Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 02a5d9eefa82..c45e6d2809d7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -755,7 +755,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) else { int size; - audit_log_format(ab, " msg="); + audit_log_format(ab, " data="); size = nlmsg_len(nlh); if (size > 0 && ((unsigned char *)data)[size - 1] == '\0') -- cgit v1.2.3 From ad395abece974e50cfd7ddd509a4faae8e238a40 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 23 Oct 2012 08:58:35 -0400 Subject: Audit: do not print error when LSMs disabled RHBZ: 785936 If the audit system collects a record about one process sending a signal to another process it includes in that collection the 'secid' or 'an int used to represent an LSM label.' If there is no LSM enabled it will collect a 0. The problem is that when we attempt to print that record we ask the LSM to convert the secid back to a string. Since there is no LSM it returns EOPNOTSUPP. Most code in the audit system checks if the secid is 0 and does not print LSM info in that case. The signal information code however forgot that check. Thus users will see a message in syslog indicating that converting the sid to string failed. Add the right check. 
Signed-off-by: Eric Paris --- kernel/auditsc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b536d22fe56f..67df4ee1d3b6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1205,12 +1205,14 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (security_secid_to_secctx(sid, &ctx, &len)) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + if (sid) { + if (security_secid_to_secctx(sid, &ctx, &len)) { + audit_log_format(ab, " obj=(none)"); + rc = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); -- cgit v1.2.3 From 72199caa8dc7f71d29328069b588340201ee73d7 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Fri, 12 Apr 2013 17:34:20 +0800 Subject: audit: remove duplicate export of audit_enabled audit_enabled has already been exported in include/linux/audit.h. 
and kernel/audit.h includes include/linux/audit.h, no need to export audit_enabled again in kernel/audit.h Signed-off-by: Gao feng Signed-off-by: Eric Paris --- kernel/audit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1b..d06ffc144f81 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -60,7 +60,6 @@ struct audit_entry { }; #ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; #endif -- cgit v1.2.3 From 34c474de7b4bd451396d67647ac728b0433379a9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 16 Apr 2013 10:17:02 -0400 Subject: audit: fix build break when AUDIT_DEBUG == 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Looks like this one has been around since 5195d8e21: kernel/auditsc.c: In function ‘audit_free_names’: kernel/auditsc.c:998: error: ‘i’ undeclared (first use in this function) ...and this warning: kernel/auditsc.c: In function ‘audit_putname’: kernel/auditsc.c:2045: warning: ‘i’ may be used uninitialized in this function Signed-off-by: Jeff Layton Signed-off-by: Eric Paris --- kernel/auditsc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 67df4ee1d3b6..4baf61d39836 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1001,6 +1001,8 @@ static inline void audit_free_names(struct audit_context *context) #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { + int i = 0; + printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -1009,7 +1011,7 @@ static inline void audit_free_names(struct audit_context *context) context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i, + printk(KERN_ERR "names[%d] = %p = %s\n", i++, 
n->name, n->name->name ?: "(null)"); } dump_stack(); @@ -2050,10 +2052,10 @@ void audit_putname(struct filename *name) __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; - int i; + int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i, + printk(KERN_ERR "name[%d] = %p = %s\n", i++, n->name, n->name->name ?: "(null)"); } #endif -- cgit v1.2.3 From 62062cf8a3a99a933efdac549da380f230dbe982 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 16 Apr 2013 13:08:43 -0400 Subject: audit: allow checking the type of audit message in the user filter When userspace sends messages to the audit system it includes a type. We want to be able to filter messages based on that type without have to do the all or nothing option currently available on the AUDIT_FILTER_TYPE filter list. Instead we should be able to use the AUDIT_FILTER_USER filter list and just use the message type as one part of the matching decision. 
Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- kernel/auditfilter.c | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c45e6d2809d7..132271448b89 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -737,7 +737,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(); + err = audit_filter_user(msg_type); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06f..9e666004e0dc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -310,6 +310,18 @@ static u32 audit_to_op(u32 op) return n; } +/* check if a field is valid for a given list */ +static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) +{ + switch(f->type) { + case AUDIT_MSGTYPE: + if (entry->rule.listnr != AUDIT_FILTER_TYPE && + entry->rule.listnr != AUDIT_FILTER_USER) + return -EINVAL; + break; + }; + return 0; +} /* Translate struct audit_rule to kernel's rule respresentation. * Exists for backward compatibility with userspace. 
*/ @@ -459,6 +471,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->gid = INVALID_GID; f->lsm_str = NULL; f->lsm_rule = NULL; + + err = audit_field_valid(entry, f); + if (err) + goto exit_free; + + err = -EINVAL; + switch(f->type) { case AUDIT_UID: case AUDIT_EUID: @@ -1354,7 +1373,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule, int type, enum audit_state *state) { int i; @@ -1378,6 +1397,9 @@ static int audit_filter_user_rules(struct audit_krule *rule, result = audit_uid_comparator(audit_get_loginuid(current), f->op, f->uid); break; + case AUDIT_MSGTYPE: + result = audit_comparator(type, f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -1404,7 +1426,7 @@ static int audit_filter_user_rules(struct audit_krule *rule, return 1; } -int audit_filter_user(void) +int audit_filter_user(int type) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1412,7 +1434,7 @@ int audit_filter_user(void) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(&e->rule, &state)) { + if (audit_filter_user_rules(&e->rule, type, &state)) { if (state == AUDIT_DISABLED) ret = 0; break; -- cgit v1.2.3 From ab61d38ed8cf670946d12dc46b9198b521c790ea Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 16 Apr 2013 17:26:51 -0400 Subject: audit: make validity checking generic We have 2 interfaces to send audit rules. Rather than check validity of things in 2 places make a helper function. 
Signed-off-by: Eric Paris --- kernel/auditfilter.c | 146 ++++++++++++++++++++++++--------------------------- 1 file changed, 70 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9e666004e0dc..ff6e09d89278 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -310,7 +310,7 @@ static u32 audit_to_op(u32 op) return n; } -/* check if a field is valid for a given list */ +/* check if an audit field is valid */ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { switch(f->type) { @@ -320,6 +320,69 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return -EINVAL; break; }; + + switch(f->type) { + default: + return -EINVAL; + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_OBJ_GID: + case AUDIT_PID: + case AUDIT_PERS: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + return -EINVAL; + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + case AUDIT_WATCH: + case AUDIT_DIR: + case AUDIT_FILTERKEY: + break; + /* arch is only allowed to be = or != */ + case AUDIT_ARCH: + if (f->op != Audit_not_equal && f->op != Audit_equal) + return -EINVAL; + break; + case AUDIT_PERM: + if (f->val & ~15) + return -EINVAL; + break; + case AUDIT_FILETYPE: + if (f->val & ~S_IFMT) + return -EINVAL; + break; + case AUDIT_FIELD_COMPARE: + if (f->val > 
AUDIT_MAX_FIELD_COMPARE) + return -EINVAL; + break; + }; return 0; } @@ -361,18 +424,17 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) if (f->op == Audit_bad) goto exit_free; - switch(f->type) { - default: + err = audit_field_valid(entry, f); + if (err) goto exit_free; + + err = -EINVAL; + switch (f->type) { case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_LOGINUID: - /* bit ops not implemented for uid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->uid = make_kuid(current_user_ns(), f->val); if (!uid_valid(f->uid)) goto exit_free; @@ -381,45 +443,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EGID: case AUDIT_SGID: case AUDIT_FSGID: - /* bit ops not implemented for gid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->gid = make_kgid(current_user_ns(), f->val); if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_PID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - /* bit ops are only useful on syscall args */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; - /* arch is only allowed to be = or != */ case AUDIT_ARCH: - if (f->op != Audit_not_equal && f->op != Audit_equal) - goto exit_free; entry->rule.arch_f = f; break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; case AUDIT_INODE: err = audit_to_inode(&entry->rule, f); if (err) @@ -477,18 +507,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; err = -EINVAL; - - switch(f->type) { + switch (f->type) { case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case 
AUDIT_FSUID: case AUDIT_LOGINUID: case AUDIT_OBJ_UID: - /* bit ops not implemented for uid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->uid = make_kuid(current_user_ns(), f->val); if (!uid_valid(f->uid)) goto exit_free; @@ -498,27 +523,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - /* bit ops not implemented for gid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->gid = make_kgid(current_user_ns(), f->val); if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_PID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; case AUDIT_ARCH: entry->rule.arch_f = f; break; @@ -589,20 +597,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_FIELD_COMPARE: - if (f->val > AUDIT_MAX_FIELD_COMPARE) - goto exit_free; - break; - default: - goto exit_free; } } -- cgit v1.2.3 From 18900909163758baf2152c9102b1a0953f7f1c30 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 18 Apr 2013 19:16:36 -0400 Subject: audit: remove the old depricated kernel interface We used to have an inflexible mechanism to add audit rules to the kernel. It hasn't been used in a long time. Get rid of that stuff. 
Signed-off-by: Eric Paris --- kernel/audit.c | 28 ++------- kernel/auditfilter.c | 160 +-------------------------------------------------- 2 files changed, 8 insertions(+), 180 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 132271448b89..274882d308d3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -597,13 +597,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EPERM; switch (msg_type) { - case AUDIT_GET: case AUDIT_LIST: - case AUDIT_LIST_RULES: - case AUDIT_SET: case AUDIT_ADD: - case AUDIT_ADD_RULE: case AUDIT_DEL: + return -EOPNOTSUPP; + case AUDIT_GET: + case AUDIT_SET: + case AUDIT_LIST_RULES: + case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: @@ -766,25 +767,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); } break; - case AUDIT_ADD: - case AUDIT_DEL: - if (nlmsg_len(nlh) < sizeof(struct audit_rule)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); - break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ff6e09d89278..ee9af6533327 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -386,89 +386,6 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return 0; } -/* Translate struct audit_rule to kernel's rule respresentation. - * Exists for backward compatibility with userspace. 
*/ -static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) -{ - struct audit_entry *entry; - int err = 0; - int i; - - entry = audit_to_entry_common(rule); - if (IS_ERR(entry)) - goto exit_nofree; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - u32 n; - - n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (n & AUDIT_NEGATE) - f->op = Audit_not_equal; - else if (!n) - f->op = Audit_equal; - else - f->op = audit_to_op(n); - - entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1; - - f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); - f->val = rule->values[i]; - f->uid = INVALID_UID; - f->gid = INVALID_GID; - - err = -EINVAL; - if (f->op == Audit_bad) - goto exit_free; - - err = audit_field_valid(entry, f); - if (err) - goto exit_free; - - err = -EINVAL; - switch (f->type) { - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_LOGINUID: - f->uid = make_kuid(current_user_ns(), f->val); - if (!uid_valid(f->uid)) - goto exit_free; - break; - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - f->gid = make_kgid(current_user_ns(), f->val); - if (!gid_valid(f->gid)) - goto exit_free; - break; - case AUDIT_ARCH: - entry->rule.arch_f = f; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - } - } - - if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) - entry->rule.inode_f = NULL; - -exit_nofree: - return entry; - -exit_free: - audit_free_rule(entry); - return ERR_PTR(err); -} - /* Translate struct audit_rule_data to kernel's rule respresentation. 
*/ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) @@ -622,36 +539,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule. - * Exists for backward compatibility with userspace. */ -static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) -{ - struct audit_rule *rule; - int i; - - rule = kzalloc(sizeof(*rule), GFP_KERNEL); - if (unlikely(!rule)) - return NULL; - - rule->flags = krule->flags | krule->listnr; - rule->action = krule->action; - rule->field_count = krule->field_count; - for (i = 0; i < rule->field_count; i++) { - rule->values[i] = krule->fields[i].val; - rule->fields[i] = krule->fields[i].type; - - if (krule->vers_ops == 1) { - if (krule->fields[i].op == Audit_not_equal) - rule->fields[i] |= AUDIT_NEGATE; - } else { - rule->fields[i] |= audit_ops[krule->fields[i].op]; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; - - return rule; -} - /* Translate kernel rule respresentation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { @@ -1064,35 +951,6 @@ out: return ret; } -/* List rules using struct audit_rule. Exists for backward - * compatibility with userspace. */ -static void audit_list(int pid, int seq, struct sk_buff_head *q) -{ - struct sk_buff *skb; - struct audit_krule *r; - int i; - - /* This is a blocking read, so use audit_filter_mutex instead of rcu - * iterator to sync with list writers. 
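*/ - for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(r, &audit_rules_list[i], list) { - struct audit_rule *rule; - - rule = audit_krule_to_rule(r); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - if (skb) - skb_queue_tail(q, skb); -} - /* List rules using struct audit_rule_data. */ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { @@ -1173,7 +1031,6 @@ int audit_receive_filter(int type, int pid, int seq, void *data, struct audit_entry *entry; switch (type) { - case AUDIT_LIST: case AUDIT_LIST_RULES: /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for @@ -1188,10 +1045,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data, skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); - if (type == AUDIT_LIST) - audit_list(pid, seq, &dest->q); - else - audit_list_rules(pid, seq, &dest->q); + audit_list_rules(pid, seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); @@ -1201,12 +1055,8 @@ int audit_receive_filter(int type, int pid, int seq, void *data, err = PTR_ERR(tsk); } break; - case AUDIT_ADD: case AUDIT_ADD_RULE: - if (type == AUDIT_ADD) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); + entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); @@ -1217,12 +1067,8 @@ int audit_receive_filter(int type, int pid, int seq, void *data, if (err) audit_free_rule(entry); break; - case AUDIT_DEL: case AUDIT_DEL_RULE: - if (type == AUDIT_DEL) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); + entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); -- cgit v1.2.3 From dc9eb698f441889f2d7926b1cc6f1e14f0787f00 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 19 Apr 2013 13:23:09 -0400 Subject: audit: stop pushing loginid, uid, sessionid as arguments We always use current. Stop pulling this when the skb comes in and pushing it around as arguments. Just get it at the end when you need it.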
Signed-off-by: Eric Paris --- kernel/audit.c | 100 +++++++++++++++++++-------------------------------- kernel/auditfilter.c | 22 +++++------- 2 files changed, 46 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 274882d308d3..bf1e1330cbb1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -265,17 +265,22 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; int rc = 0; + u32 sessionid = audit_get_sessionid(current); + uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + u32 sid; + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, from_kuid(&init_user_ns, loginuid), sessionid); + old, auid, sessionid); + + security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; u32 len; @@ -294,9 +299,7 @@ static int audit_log_config_change(char *function_name, int new, int old, return rc; } -static int audit_do_config_change(char *function_name, int *to_change, - int new, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_do_config_change(char *function_name, int *to_change, int new) { int allow_changes, rc = 0, old = *to_change; @@ -307,8 +310,7 @@ static int audit_do_config_change(char *function_name, int *to_change, allow_changes = 1; if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, loginuid, - sessionid, sid, allow_changes); + rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } @@ -322,44 +324,37 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_set_rate_limit(int limit) { - return 
audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sessionid, sid); + return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } -static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_set_backlog_limit(int limit) { - return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sessionid, sid); + return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } -static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) return -EINVAL; - rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sessionid, sid); - + rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } -static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; - return audit_do_config_change("audit_failure", &audit_failure, state, - loginuid, sessionid, sid); + return audit_do_config_change("audit_failure", &audit_failure, state); } /* @@ -627,12 +622,15 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - kuid_t auid, u32 ses, u32 sid) +static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; char *ctx = NULL; u32 len; + u32 sessionid = audit_get_sessionid(current); + uid_t uid = from_kuid(&init_user_ns, current_uid()); + uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + u32 sid; if (!audit_enabled) { *ab = NULL; @@ -643,9 +641,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, if 
(unlikely(!*ab)) return rc; audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - task_tgid_vnr(current), - from_kuid(&init_user_ns, current_uid()), - from_kuid(&init_user_ns, auid), ses); + task_tgid_vnr(current), uid, auid, sessionid); + security_task_getsecid(current, &sid); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -661,14 +658,12 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 seq, sid; + u32 seq; void *data; struct audit_status *status_get, status_set; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - kuid_t loginuid; /* loginuid of sender */ - u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; @@ -677,9 +672,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - loginuid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -700,14 +692,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, - loginuid, sessionid, sid); + err = audit_set_enabled(status_get->enabled); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, - loginuid, sessionid, sid); + err = audit_set_failure(status_get->failure); if (err < 0) return err; } @@ -715,22 +705,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int new_pid = status_get->pid; if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, - audit_pid, loginuid, - sessionid, sid, 1); - + audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; audit_nlk_portid = 
NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { - err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sessionid, sid); + err = audit_set_rate_limit(status_get->rate_limit); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sessionid, sid); + err = audit_set_backlog_limit(status_get->backlog_limit); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: @@ -742,14 +727,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_task(current, loginuid, - sessionid); + err = tty_audit_push_task(current); if (err) break; } - audit_log_common_recv_msg(&ab, msg_type, - loginuid, sessionid, sid); - + audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.1024s'", (char *)data); @@ -772,26 +754,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); audit_log_end(ab); return -EPERM; } /* fallthrough */ case AUDIT_LIST_RULES: err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); + seq, data, nlmsg_len(nlh)); break; case AUDIT_TRIM: audit_trim_trees(); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -821,8 +796,7 @@ static int audit_receive_msg(struct 
sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ee9af6533327..f952234da2ca 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -980,11 +980,12 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, - char *action, struct audit_krule *rule, - int res) +static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; + uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + u32 sessionid = audit_get_sessionid(current); + u32 sid; if (!audit_enabled) return; @@ -992,8 +993,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", - from_kuid(&init_user_ns, loginuid), sessionid); + audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); + security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; u32 len; @@ -1022,8 +1023,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ -int audit_receive_filter(int type, int pid, int seq, void *data, - size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) +int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1061,9 +1061,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - 
audit_log_rule_change(loginuid, sessionid, sid, "add rule", - &entry->rule, !err); - + audit_log_rule_change("add rule", &entry->rule, !err); if (err) audit_free_rule(entry); break; @@ -1073,9 +1071,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove rule", - &entry->rule, !err); - + audit_log_rule_change("remove rule", &entry->rule, !err); audit_free_rule(entry); break; default: -- cgit v1.2.3 From 152f497b9b5940f81de3205465840a5eb316458e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 19 Apr 2013 13:56:11 -0400 Subject: audit: push loginuid and sessionid processing down Since we are always current, we can push a lot of this stuff to the bottom and get rid of useless interfaces and arguments. Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index bf1e1330cbb1..79b42fd14c22 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -727,7 +727,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_task(current); + err = tty_audit_push_current(); if (err) break; } -- cgit v1.2.3 From b122c3767c1d89763b4babca062c3171a71ed97c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 19 Apr 2013 15:00:33 -0400 Subject: audit: use a consistent audit helper to log lsm information We have a number of places we were reimplementing the same code to write out lsm labels. Just do it one darn place. 
Signed-off-by: Eric Paris --- kernel/audit.c | 34 ++++------------------------------ kernel/auditfilter.c | 13 +------------ kernel/auditsc.c | 10 +++++----- 3 files changed, 10 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 79b42fd14c22..a3c77b979b5b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -271,29 +271,15 @@ static int audit_log_config_change(char *function_name, int new, int old, int rc = 0; u32 sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - u32 sid; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, old, auid, sessionid); - - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; - - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) { - audit_log_format(ab, " sid=%u", sid); - allow_changes = 0; /* Something weird, deny request */ - } else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + rc = audit_log_task_context(ab); + if (rc) + allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; @@ -625,12 +611,9 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; - char *ctx = NULL; - u32 len; u32 sessionid = audit_get_sessionid(current); uid_t uid = from_kuid(&init_user_ns, current_uid()); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - u32 sid; if (!audit_enabled) { *ab = NULL; @@ -642,16 +625,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return rc; audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", task_tgid_vnr(current), uid, auid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - rc = 
security_secid_to_secctx(sid, &ctx, &len); - if (rc) - audit_log_format(*ab, " ssid=%u", sid); - else { - audit_log_format(*ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_task_context(*ab); return rc; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f952234da2ca..478f4602c96b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -985,7 +985,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re struct audit_buffer *ab; uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); u32 sessionid = audit_get_sessionid(current); - u32 sid; if (!audit_enabled) return; @@ -994,17 +993,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re if (!ab) return; audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_task_context(ab); audit_log_format(ab, " op="); audit_log_string(ab, action); audit_log_key(ab, rule->filterkey); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4baf61d39836..17e9a260a545 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1109,7 +1109,7 @@ static inline void audit_free_context(struct audit_context *context) kfree(context); } -void audit_log_task_context(struct audit_buffer *ab) +int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; unsigned len; @@ -1118,22 +1118,22 @@ void audit_log_task_context(struct audit_buffer *ab) security_task_getsecid(current, &sid); if (!sid) - return; + return 0; error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; - return; + return 0; } audit_log_format(ab, " subj=%s", ctx); security_release_secctx(ctx, len); - return; + return 
0; error_path: audit_panic("error in audit_log_task_context"); - return; + return error; } EXPORT_SYMBOL(audit_log_task_context); -- cgit v1.2.3 From 4d3fb709b285ac885c40950a837edbfc90029c5f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 30 Apr 2013 09:53:34 -0400 Subject: helper for some session id stuff --- kernel/audit.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a3c77b979b5b..44803f25b236 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -269,14 +269,12 @@ static int audit_log_config_change(char *function_name, int new, int old, { struct audit_buffer *ab; int rc = 0; - u32 sessionid = audit_get_sessionid(current); - uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, auid, sessionid); + audit_log_format(ab, "%s=%d old=%d", function_name, new, old); + audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) allow_changes = 0; /* Something weird, deny request */ @@ -611,9 +609,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; - u32 sessionid = audit_get_sessionid(current); uid_t uid = from_kuid(&init_user_ns, current_uid()); - uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); if (!audit_enabled) { *ab = NULL; @@ -623,8 +619,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - task_tgid_vnr(current), uid, auid, sessionid); + audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_session_info(*ab); audit_log_task_context(*ab); return rc; @@ 
-1376,6 +1372,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_session_info(struct audit_buffer *ab) +{ + u32 sessionid = audit_get_sessionid(current); + uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + + audit_log_format(ab, "auid=%u ses=%u\n", auid, sessionid); +} + void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); -- cgit v1.2.3 From bde02ca858448cf54a4226774dd1481f3bcc455e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 30 Apr 2013 11:01:14 -0400 Subject: audit: use spin_lock_irqsave/restore in audit tty code Some of the callers of the audit tty function use spin_lock_irqsave/restore. We were using the forced always enable version, which seems really bad. Since I don't know every one of these code paths well enough, it makes sense to just switch everything to the safe version. Maybe it's a little overzealous, but it's a lot better than an unlucky deadlock when we return to a caller with irq enabled and they expect it to be disabled. 
Signed-off-by: Eric Paris --- kernel/audit.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 44803f25b236..241aa8593fa8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -804,10 +804,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk = current; + unsigned long flags; - spin_lock_irq(&tsk->sighand->siglock); + spin_lock_irqsave(&tsk->sighand->siglock, flags); s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); @@ -816,6 +817,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_SET: { struct audit_tty_status *s; struct task_struct *tsk = current; + unsigned long flags; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; @@ -823,9 +825,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - spin_lock_irq(&tsk->sighand->siglock); + spin_lock_irqsave(&tsk->sighand->siglock, flags); tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); break; } default: -- cgit v1.2.3 From 46e959ea2969cc1668d09b0dc55226946cf781f1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 3 May 2013 14:03:50 -0400 Subject: audit: add an option to control logging of passwords with pam_tty_audit Most commands are entered one line at a time and processed as complete lines in non-canonical mode. Commands that interactively require a password, enter canonical mode to do this while shutting off echo. 
This pair of features (icanon and !echo) can be used to avoid logging passwords by audit while still logging the rest of the command. Adding a member (log_passwd) to the struct audit_tty_status passed in by pam_tty_audit allows control of canonical mode without echo per task. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 241aa8593fa8..998a0d4155cf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -49,6 +49,7 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> +#include <linux/kernel.h> #include <linux/audit.h> @@ -808,6 +809,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) spin_lock_irqsave(&tsk->sighand->siglock, flags); s.enabled = tsk->signal->audit_tty != 0; + s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock_irqrestore(&tsk->sighand->siglock, flags); audit_send_reply(NETLINK_CB(skb).portid, seq, @@ -815,18 +817,20 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_TTY_SET: { - struct audit_tty_status *s; + struct audit_tty_status s; struct task_struct *tsk = current; unsigned long flags; - if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) - return -EINVAL; - s = data; - if (s->enabled != 0 && s->enabled != 1) + memset(&s, 0, sizeof(s)); + /* guard against past and future API changes */ + memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); + if ((s.enabled != 0 && s.enabled != 1) || + (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; spin_lock_irqsave(&tsk->sighand->siglock, flags); - tsk->signal->audit_tty = s->enabled != 0; + tsk->signal->audit_tty = s.enabled; + tsk->signal->audit_tty_log_passwd = s.log_passwd; spin_unlock_irqrestore(&tsk->sighand->siglock, flags); break; } -- cgit v1.2.3 From 7173c54e3a9deb491a586e7e107375109ee48bcb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 30 Apr 2013 11:28:04 -0400 Subject: audit: use
spin_lock in audit_receive_msg to process tty logging This function is called when we receive a netlink message from userspace. We don't need to worry about it coming from irq context or irqs making it re-entrant. Signed-off-by: Eric Paris --- kernel/audit.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 998a0d4155cf..d308723d22da 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -805,12 +805,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk = current; - unsigned long flags; - spin_lock_irqsave(&tsk->sighand->siglock, flags); + spin_lock(&tsk->sighand->siglock); s.enabled = tsk->signal->audit_tty != 0; s.log_passwd = tsk->signal->audit_tty_log_passwd; - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_unlock(&tsk->sighand->siglock); audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); @@ -819,7 +818,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_SET: { struct audit_tty_status s; struct task_struct *tsk = current; - unsigned long flags; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ @@ -828,10 +826,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; - spin_lock_irqsave(&tsk->sighand->siglock, flags); + spin_lock(&tsk->sighand->siglock); tsk->signal->audit_tty = s.enabled; tsk->signal->audit_tty_log_passwd = s.log_passwd; - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_unlock(&tsk->sighand->siglock); break; } default: -- cgit v1.2.3 From b24a30a7305418ff138ff51776fc555ec57c011a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 30 Apr 2013 15:30:32 -0400 Subject: audit: fix event coverage of AUDIT_ANOM_LINK The userspace audit tools didn't like the existing 
formatting of the AUDIT_ANOM_LINK event. It needed to be expanded to emit an AUDIT_PATH event as well, so this implements the change. The bulk of the patch is moving code out of auditsc.c into audit.c and audit.h for general use. It expands audit_log_name to include an optional "struct path" argument for the simple case of just needing to report a pathname. This also makes audit_log_task_info available when syscall auditing is not enabled, since it is needed in either case for process details. Signed-off-by: Kees Cook Reported-by: Steve Grubb --- kernel/audit.c | 244 ++++++++++++++++++++++++++++++++++++-- kernel/audit.h | 157 +++++++++++++++++++++++++ kernel/auditsc.c | 353 +------------------------------------------------------ 3 files changed, 394 insertions(+), 360 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d308723d22da..8cc580316948 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -1393,6 +1394,224 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } +void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", + cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", + name->fcap.fE, name->fcap_ver); +} + +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = 
get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data into an audit_names. */ +void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + const struct inode *inode) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + audit_copy_fcaps(name, dentry); +} + +/** + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +void audit_log_name(struct audit_context *context, struct audit_names *n, + struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + 
audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + +int audit_log_task_context(struct audit_buffer *ab) +{ + char *ctx = NULL; + unsigned len; + int error; + u32 sid; + + security_task_getsecid(current, &sid); + if (!sid) + return 0; + + error = security_secid_to_secctx(sid, &ctx, &len); + if (error) { + if (error != -EINVAL) + goto error_path; + return 0; + } + + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + return 0; + +error_path: + audit_panic("error in audit_log_task_context"); + return error; +} +EXPORT_SYMBOL(audit_log_task_context); + +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +{ + const struct cred *cred; + char name[sizeof(tsk->comm)]; + struct mm_struct *mm = tsk->mm; + char *tty; + + if (!ab) + return; + + /* tsk == current */ + cred = current_cred(); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_log_format(ab, + " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + sys_getppid(), + tsk->pid, + from_kuid(&init_user_ns, audit_get_loginuid(tsk)), + from_kuid(&init_user_ns, cred->uid), + from_kgid(&init_user_ns, cred->gid), + 
from_kuid(&init_user_ns, cred->euid), + from_kuid(&init_user_ns, cred->suid), + from_kuid(&init_user_ns, cred->fsuid), + from_kgid(&init_user_ns, cred->egid), + from_kgid(&init_user_ns, cred->sgid), + from_kgid(&init_user_ns, cred->fsgid), + audit_get_sessionid(tsk), tty); + + get_task_comm(name, tsk); + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, name); + + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); + up_read(&mm->mmap_sem); + } + audit_log_task_context(ab); +} +EXPORT_SYMBOL(audit_log_task_info); + /** * audit_log_link_denied - report a link restriction denial * @operation: specific link opreation @@ -1401,19 +1620,28 @@ void audit_log_key(struct audit_buffer *ab, char *key) void audit_log_link_denied(const char *operation, struct path *link) { struct audit_buffer *ab; + struct audit_names *name; + + name = kzalloc(sizeof(*name), GFP_NOFS); + if (!name) + return; + /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) - return; - audit_log_format(ab, "op=%s action=denied", operation); - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_d_path(ab, " path=", link); - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); + goto out; + audit_log_format(ab, "op=%s", operation); + audit_log_task_info(ab, current); + audit_log_format(ab, " res=0"); audit_log_end(ab); + + /* Generate AUDIT_PATH record with object. 
*/ + name->type = AUDIT_TYPE_NORMAL; + audit_copy_inode(name, link->dentry, link->dentry->d_inode); + audit_log_name(current->audit_context, name, link, 0, NULL); +out: + kfree(name); } /** diff --git a/kernel/audit.h b/kernel/audit.h index d06ffc144f81..45c8325de5bb 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -22,6 +22,7 @@ #include #include #include +#include /* 0 = no checking 1 = put_count checking @@ -29,6 +30,11 @@ */ #define AUDIT_DEBUG 0 +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 + /* At task start time, the audit_state is set in the audit_context using a per-task filter. At syscall entry, the audit_state is augmented by the syscall filter. */ @@ -59,8 +65,159 @@ struct audit_entry { struct audit_krule rule; }; +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of file cap */ + kernel_cap_t effective; /* effective set of process */ + }; +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ +struct audit_names { + struct list_head list; /* audit_context->names_list */ + + struct filename *name; + int name_len; /* number of chars to log */ + bool name_put; /* call __putname()? */ + + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + unsigned char type; /* record type */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit. 
+ */ + bool should_free; +}; + +/* The per-task audit context. */ +struct audit_context { + int dummy; /* must be the first element */ + int in_syscall; /* 1 if task is in a syscall */ + enum audit_state state, current_state; + unsigned int serial; /* serial number for record */ + int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ + unsigned long argv[4]; /* syscall arguments */ + long return_code;/* syscall return code */ + u64 prio; + int return_valid; /* return code is valid */ + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* struct audit_names->list anchor */ + char *filterkey; /* key for rule that triggered record */ + struct path pwd; + struct audit_aux_data *aux; + struct audit_aux_data *aux_pids; + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; + /* Save things to print about task_struct */ + pid_t pid, ppid; + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; + unsigned long personality; + int arch; + + pid_t target_pid; + kuid_t target_auid; + kuid_t target_uid; + unsigned int target_sessionid; + u32 target_sid; + char target_comm[TASK_COMM_LEN]; + + struct audit_tree_refs *trees, *first_trees; + struct list_head killed_trees; + int tree_count; + + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + struct { + kuid_t uid; + kgid_t gid; + umode_t mode; + u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + umode_t perm_mode; + unsigned long qbytes; + } ipc; + struct { + mqd_t mqdes; + struct 
mq_attr mqstat; + } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; + struct { + int oflag; + umode_t mode; + struct mq_attr attr; + } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; + struct { + int fd; + int flags; + } mmap; + }; + int fds[2]; + +#if AUDIT_DEBUG + int put_count; + int ino_count; +#endif +}; + #ifdef CONFIG_AUDIT +extern int audit_enabled; extern int audit_ever_enabled; + +extern void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + const struct inode *inode); +extern void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap); +extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); +extern void audit_log_name(struct audit_context *context, + struct audit_names *n, struct path *path, + int record_num, int *call_panic); #endif extern int audit_pid; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17e9a260a545..add3086bdb02 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -76,11 +76,6 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). If we get more names we will allocate - * a name dynamically and also add those to the list anchored by names_list. 
*/ -#define AUDIT_NAMES 5 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -90,44 +85,6 @@ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; -struct audit_cap_data { - kernel_cap_t permitted; - kernel_cap_t inheritable; - union { - unsigned int fE; /* effective bit of a file capability */ - kernel_cap_t effective; /* effective set of a process */ - }; -}; - -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). - * - * Further, in fs/namei.c:path_lookup() we store the inode and device. - */ -struct audit_names { - struct list_head list; /* audit_context->names_list */ - struct filename *name; - unsigned long ino; - dev_t dev; - umode_t mode; - kuid_t uid; - kgid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - unsigned char type; /* record type */ - bool name_put; /* call __putname() for this name */ - /* - * This was an allocated audit_names and not from the array of - * names allocated in the task audit context. Thus this name - * should be freed on syscall exit - */ - bool should_free; -}; - struct audit_aux_data { struct audit_aux_data *next; int type; @@ -175,106 +132,6 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; -/* The per-task audit context. 
*/ -struct audit_context { - int dummy; /* must be the first element */ - int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ - int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ - unsigned long argv[4]; /* syscall arguments */ - long return_code;/* syscall return code */ - u64 prio; - int return_valid; /* return code is valid */ - /* - * The names_list is the list of all audit_names collected during this - * syscall. The first AUDIT_NAMES entries in the names_list will - * actually be from the preallocated_names array for performance - * reasons. Except during allocation they should never be referenced - * through the preallocated_names array and should only be found/used - * by running the names_list. - */ - struct audit_names preallocated_names[AUDIT_NAMES]; - int name_count; /* total records in names_list */ - struct list_head names_list; /* anchor for struct audit_names->list */ - char * filterkey; /* key for rule that triggered record */ - struct path pwd; - struct audit_aux_data *aux; - struct audit_aux_data *aux_pids; - struct sockaddr_storage *sockaddr; - size_t sockaddr_len; - /* Save things to print about task_struct */ - pid_t pid, ppid; - kuid_t uid, euid, suid, fsuid; - kgid_t gid, egid, sgid, fsgid; - unsigned long personality; - int arch; - - pid_t target_pid; - kuid_t target_auid; - kuid_t target_uid; - unsigned int target_sessionid; - u32 target_sid; - char target_comm[TASK_COMM_LEN]; - - struct audit_tree_refs *trees, *first_trees; - struct list_head killed_trees; - int tree_count; - - int type; - union { - struct { - int nargs; - long args[AUDITSC_ARGS]; - } socketcall; - struct { - kuid_t uid; - kgid_t gid; - umode_t mode; - u32 osid; - int has_perm; - uid_t perm_uid; - gid_t perm_gid; - umode_t perm_mode; - unsigned long qbytes; - } ipc; - struct { - mqd_t mqdes; - struct mq_attr mqstat; - } mq_getsetattr; - struct { - 
mqd_t mqdes; - int sigev_signo; - } mq_notify; - struct { - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; - } mq_sendrecv; - struct { - int oflag; - umode_t mode; - struct mq_attr attr; - } mq_open; - struct { - pid_t pid; - struct audit_cap_data cap; - } capset; - struct { - int fd; - int flags; - } mmap; - }; - int fds[2]; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif -}; - static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); @@ -1109,88 +966,6 @@ static inline void audit_free_context(struct audit_context *context) kfree(context); } -int audit_log_task_context(struct audit_buffer *ab) -{ - char *ctx = NULL; - unsigned len; - int error; - u32 sid; - - security_task_getsecid(current, &sid); - if (!sid) - return 0; - - error = security_secid_to_secctx(sid, &ctx, &len); - if (error) { - if (error != -EINVAL) - goto error_path; - return 0; - } - - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - return 0; - -error_path: - audit_panic("error in audit_log_task_context"); - return error; -} - -EXPORT_SYMBOL(audit_log_task_context); - -void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) -{ - const struct cred *cred; - char name[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; - char *tty; - - if (!ab) - return; - - /* tsk == current */ - cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - - - audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", - sys_getppid(), - tsk->pid, - from_kuid(&init_user_ns, tsk->loginuid), - from_kuid(&init_user_ns, cred->uid), - from_kgid(&init_user_ns, cred->gid), - from_kuid(&init_user_ns, cred->euid), - from_kuid(&init_user_ns, cred->suid), - from_kuid(&init_user_ns, 
cred->fsuid), - from_kgid(&init_user_ns, cred->egid), - from_kgid(&init_user_ns, cred->sgid), - from_kgid(&init_user_ns, cred->fsgid), - tsk->sessionid, tty); - - get_task_comm(name, tsk); - audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); - - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } - audit_log_task_context(ab); -} - -EXPORT_SYMBOL(audit_log_task_info); - static int audit_log_pid_context(struct audit_context *context, pid_t pid, kuid_t auid, kuid_t uid, unsigned int sessionid, u32 sid, char *comm) @@ -1408,35 +1183,6 @@ static void audit_log_execve_info(struct audit_context *context, kfree(buf); } -static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) { - audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); - } -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); -} - static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1534,68 +1280,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } -static void audit_log_name(struct audit_context *context, struct audit_names *n, - int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", record_num); - - if (n->name) { - switch (n->name_len) { 
- case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); -} - static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1713,7 +1397,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts i = 0; list_for_each_entry(n, &context->names_list, list) - audit_log_name(context, n, i++, &call_panic); + audit_log_name(context, n, NULL, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -2078,41 +1762,6 @@ void audit_putname(struct filename *name) #endif } -static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - 
name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - - return 0; -} - - -/* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - /** * __audit_inode - store the inode and device from a lookup * @name: name being audited -- cgit v1.2.3 From 2228768885e0b92c0f7b276cc61b8974e7aed724 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 3 May 2013 11:16:18 -0400 Subject: ring-buffer: Select IRQ_WORK As the wake up logic for waiters on the buffer has been moved from the tracing code to the ring buffer, it requires also adding IRQ_WORK as the wake up code is performed via irq_work. This fixes compile breakage when a user of the ring buffer is selected but tracing and irq_work are not. 
Link http://lkml.kernel.org/r/20130503115332.GT8356@rric.localhost Cc: Arnd Bergmann Reported-by: Robert Richter Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5e9efd4b83a4..015f85aaca08 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -71,6 +71,7 @@ config TRACE_CLOCK config RING_BUFFER bool select TRACE_CLOCK + select IRQ_WORK config FTRACE_NMI_ENTER bool @@ -107,7 +108,6 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK - select IRQ_WORK config GENERIC_TRACER bool -- cgit v1.2.3 From fbd44a607a1a5019bc32c3615cead8c5ee8f89c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 20:22:36 +0200 Subject: tick: Use zalloc_cpumask_var for allocating offstack cpumasks commit b352bc1cbc (tick: Convert broadcast cpu bitmaps to cpumask_var_t) broke CONFIG_CPUMASK_OFFSTACK in a very subtle way. Instead of allocating the cpumasks with zalloc_cpumask_var it uses alloc_cpumask_var, so we can get random data there, which of course confuses the logic completely and causes random failures. 
Reported-and-tested-by: Dave Jones Reported-and-tested-by: Yinghai Lu Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305032015060.2990@ionos Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 61d00a8cdf2f..d70cdc42c829 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -785,11 +785,11 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { - alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); - alloc_cpumask_var(&tmpmask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT - alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif } -- cgit v1.2.3 From 524eff183f51d080a83b348d0ea97c08b3607b9a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:17 +0200 Subject: perf: Fix EXIT event notification The perf_event_task_ctx() function needs to be called with preemption disabled, since it checks the currently scheduled cpu against the event cpu. We disable preemption for the task related perf event context if there's one defined, leaving it up to chance which cpu it gets scheduled on.
Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b41c1899a8b..38b68a05c3c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4474,7 +4474,7 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; struct pmu *pmu; int ctxn; @@ -4485,20 +4485,22 @@ static void perf_event_task_event(struct perf_task_event *task_event) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); - } + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } - if (task_event->task_ctx) - perf_event_task_ctx(task_event->task_ctx, task_event); + if (task_ctx) { + preempt_disable(); + perf_event_task_ctx(task_ctx, task_event); + preempt_enable(); + } rcu_read_unlock(); } -- cgit v1.2.3 From 52d857a8784a09576215c71cebf368d61c12a754 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:18 +0200 Subject: perf: Factor out auxiliary events notification Add perf_event_aux() function to send out all types of auxiliary events - mmap, 
task, comm events. For each type there's match and output functions defined and used as callbacks during perf_event_aux processing. This way we can centralize the pmu/context iterating and event matching logic. Also since lot of the code was duplicated, this patch reduces the .text size about 2kB on my setup: snipped output from 'objdump -x kernel/events/core.o' before: Idx Name Size 0 .text 0000d313 after: Idx Name Size 0 .text 0000cad3 Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 242 +++++++++++++++++++-------------------------------- 1 file changed, 89 insertions(+), 153 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 38b68a05c3c6..9dc297faf7c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4394,6 +4394,64 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } +typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); +typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); + +static void +perf_event_aux_ctx(struct perf_event_context *ctx, + perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + if (match(event, data)) + output(event, data); + } +} + +static void +perf_event_aux(perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data, + struct perf_event_context *task_ctx) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + 
rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + goto next; + perf_event_aux_ctx(&cpuctx->ctx, match, output, data); + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_aux_ctx(ctx, match, output, data); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + + if (task_ctx) { + preempt_disable(); + perf_event_aux_ctx(task_ctx, match, output, data); + preempt_enable(); + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * @@ -4416,8 +4474,9 @@ struct perf_task_event { }; static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) + void *data) { + struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; @@ -4445,64 +4504,11 @@ out: task_event->event_id.header.size = size; } -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) +static int perf_event_task_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = 
get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - if (task_ctx) - goto next; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - if (task_ctx) { - preempt_disable(); - perf_event_task_ctx(task_ctx, task_event); - preempt_enable(); - } - - rcu_read_unlock(); + return event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task; } static void perf_event_task(struct task_struct *task, @@ -4533,7 +4539,10 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_task_event(&task_event); + perf_event_aux(perf_event_task_match, + perf_event_task_output, + &task_event, + task_ctx); } void perf_event_fork(struct task_struct *task) @@ -4559,8 +4568,9 @@ struct perf_comm_event { }; static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) + void *data) { + struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; @@ -4587,39 +4597,16 @@ out: comm_event->event_id.header.size = size; } -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) +static int perf_event_comm_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } + return event->attr.comm; } static void perf_event_comm_event(struct perf_comm_event 
*comm_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; char comm[TASK_COMM_LEN]; unsigned int size; - struct pmu *pmu; - int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -4629,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_comm_match, + perf_event_comm_output, + comm_event, + NULL); } void perf_event_comm(struct task_struct *task) @@ -4708,8 +4682,9 @@ struct perf_mmap_event { }; static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) + void *data) { + struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; @@ -4736,46 +4711,24 @@ out: } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) + void *data) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; + struct perf_mmap_event *mmap_event = data; + struct vm_area_struct *vma = mmap_event->vma; + int executable = 
vma->vm_flags & VM_EXEC; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } + return (!executable && event->attr.mmap_data) || + (executable && event->attr.mmap); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; const char *name; - struct pmu *pmu; - int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -4831,27 +4784,10 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_mmap_match, + perf_event_mmap_output, + mmap_event, + NULL); kfree(buf); } -- cgit v1.2.3 From 780a7654cee8d61819512385e778e4827db4bfbc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 9 Apr 2013 02:22:10 -0700 Subject: audit: Make testing for a valid loginuid explicit. audit rule additions containing "-F auid!=4294967295" were failing with EINVAL because of a regression caused by e1760bd. Apparently some userland audit rule sets want to know if loginuid uid has been set and are using a test for auid != 4294967295 to determine that. 
In practice that is a horrible way to ask if a value has been set, because it relies on subtle implementation details and will break every time the uid implementation in the kernel changes. So add a clean way to test if the audit loginuid has been set, and silently convert the old idiom to the cleaner and more comprehensible new idiom. Cc: # 3.7 Reported-By: Richard Guy Briggs Signed-off-by: "Eric W. Biederman" Tested-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditfilter.c | 17 +++++++++++++++-- kernel/auditsc.c | 5 ++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 478f4602c96b..bc6595fe952e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -365,7 +365,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_DIR: case AUDIT_FILTERKEY: break; - /* arch is only allowed to be = or != */ + case AUDIT_LOGINUID_SET: + if ((f->val != 0) && (f->val != 1)) + return -EINVAL; + /* FALL THROUGH */ case AUDIT_ARCH: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; @@ -419,17 +422,23 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_str = NULL; f->lsm_rule = NULL; + /* Support legacy tests for a valid loginuid */ + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + f->type = AUDIT_LOGINUID_SET; + f->val = 0; + } + err = audit_field_valid(entry, f); if (err) goto exit_free; err = -EINVAL; switch (f->type) { + case AUDIT_LOGINUID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: - case AUDIT_LOGINUID: case AUDIT_OBJ_UID: f->uid = make_kuid(current_user_ns(), f->val); if (!uid_valid(f->uid)) @@ -1222,6 +1231,10 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type, result = audit_uid_comparator(audit_get_loginuid(current), f->op, f->uid); break; + case AUDIT_LOGINUID_SET: + result = 
audit_comparator(audit_loginuid_set(current), + f->op, f->val); + break; case AUDIT_MSGTYPE: result = audit_comparator(type, f->op, f->val); break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index add3086bdb02..3c8a601324a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -613,6 +613,9 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -1970,7 +1973,7 @@ int audit_set_loginuid(kuid_t loginuid) unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (uid_valid(task->loginuid)) + if (audit_loginuid_set(task)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) -- cgit v1.2.3 From 82d8da0d46ae7d3e9089efadb5e8d9841c20a431 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 7 May 2013 21:24:02 -0400 Subject: Revert "audit: move kaudit thread start from auditd registration to kaudit init" This reverts commit 6ff5e45985c2fcb97947818f66d1eeaf9d6600b2. Conflicts: kernel/audit.c This patch was starting a kthread for all the time. Since the follow on patches that required it didn't get finished in 3.10 time, we shouldn't ship this change in 3.10. 
Signed-off-by: Eric Paris --- kernel/audit.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8cc580316948..f9c6506536e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -644,6 +644,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; + /* As soon as there's any sign of userspace auditd, + * start kauditd to talk to it */ + if (!kauditd_task) + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } + seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -895,10 +905,6 @@ static int __init audit_init(void) else audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) - return PTR_ERR(kauditd_task); - skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; -- cgit v1.2.3 From 2a0b4be6dd655e24990da1d0811e28b9277f8b12 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 8 May 2013 00:01:07 -0400 Subject: audit: fix message spacing printing auid The helper function didn't include a leading space, so it was jammed against the previous text in the audit record. 
Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f9c6506536e6..5c7e62ff4795 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1388,7 +1388,7 @@ void audit_log_session_info(struct audit_buffer *ab) u32 sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, "auid=%u ses=%u\n", auid, sessionid); + audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) -- cgit v1.2.3 From a5b85bd1557209b4ef18a8cf07e60a1ca3132468 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:14 +0900 Subject: tracing: Don't succeed if event_enable_func did not register anything Return 0 instead of the number of activated ftrace function probes if event_enable_func succeeded and return an error code if it failed or did not register any functions. But it currently returns the number of registered functions and if it didn't register anything, it returns 0, but that is considered success. This also fixes the return value. As if it succeeds, it returns the number of functions that were enabled, which is returned back to the user in ftrace_regex_write (the write() return code). If only one function is enabled, then the return code of the write is one, and this can confuse the user program in thinking it only wrote 1 byte. 
Link: http://lkml.kernel.org/r/20130509054413.30398.55650.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu [ Rewrote change log to reflect that this fixes two bugs - SR ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53582e982e51..44ac83614c3d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2061,8 +2061,11 @@ event_enable_func(struct ftrace_hash *hash, if (ret < 0) goto out_put; ret = register_ftrace_function_probe(glob, ops, data); - if (!ret) + if (!ret) { + ret = -ENOENT; goto out_disable; + } else + ret = 0; out: mutex_unlock(&event_mutex); return ret; -- cgit v1.2.3 From ff305ded9ff83436039a16d31bc558dc6598d7ce Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 9 May 2013 11:30:26 -0400 Subject: tracing: Return error if register_ftrace_function_probe() fails for event_enable_func() register_ftrace_function_probe() returns the number of functions it registered, which can be zero, it can also return a negative number if something went wrong. But event_enable_func() only checks for the case that it didn't register anything, it needs to also check for the case that something went wrong and return that error code as well. Added some comments about the code as well, to make it more understandable. 
Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 44ac83614c3d..87e826f1c237 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2061,11 +2061,18 @@ event_enable_func(struct ftrace_hash *hash, if (ret < 0) goto out_put; ret = register_ftrace_function_probe(glob, ops, data); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ if (!ret) { ret = -ENOENT; goto out_disable; - } else - ret = 0; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; out: mutex_unlock(&event_mutex); return ret; -- cgit v1.2.3 From 7c088b5120ffef017e2ddc38f992277e96436ef6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 9 May 2013 11:35:12 -0400 Subject: ftrace: Have ftrace_regex_write() return either read or error As ftrace_regex_write() reads the result of ftrace_process_regex() which can sometimes return a positive number, only consider a failure if the return is negative. Otherwise, it will skip possible other registered probes and by returning a positive number that wasn't read, it will confuse the user processes doing the writing. 
Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8a5c017bb50c..d85a0ad81a67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3276,7 +3276,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = ftrace_process_regex(iter->hash, parser->buffer, parser->idx, enable); trace_parser_clear(parser); - if (ret) + if (ret < 0) goto out_unlock; } -- cgit v1.2.3 From 91c2e0bcae72a3086c698b5de2b950b885abb0e6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Mar 2013 20:10:59 -0500 Subject: unify compat fanotify_mark(2), switch to COMPAT_SYSCALL_DEFINE Signed-off-by: Al Viro --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bfd6787b355a..7078052284fd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -200,6 +200,7 @@ cond_syscall(sys_perf_event_open); /* fanotify! 
*/ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); +cond_syscall(compat_sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); -- cgit v1.2.3 From c5ddd2024a87353f73068732cfd38d3dfec22e87 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Mar 2013 14:25:51 -0400 Subject: switch compat_sys_sysctl to COMPAT_SYSCALL_DEFINE Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ebf72358e86a..aea4a9ea6fc8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL_SYSCALL @@ -1447,7 +1448,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) #ifdef CONFIG_COMPAT -#include struct compat_sysctl_args { compat_uptr_t name; @@ -1459,7 +1459,7 @@ struct compat_sysctl_args { compat_ulong_t __unused[4]; }; -asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) { struct compat_sysctl_args tmp; compat_size_t __user *compat_oldlenp; -- cgit v1.2.3 From f04f24fb7e48d446bd89a01c6056571f25972511 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:17 +0900 Subject: ftrace, kprobes: Fix a deadlock on ftrace_regex_lock Fix a deadlock on ftrace_regex_lock which happens when setting an enable_event trigger on dynamic kprobe event as below. 
---- sh-2.05b# echo p vfs_symlink > kprobe_events sh-2.05b# echo vfs_symlink:enable_event:kprobes:p_vfs_symlink_0 > set_ftrace_filter ============================================= [ INFO: possible recursive locking detected ] 3.9.0+ #35 Not tainted --------------------------------------------- sh/72 is trying to acquire lock: (ftrace_regex_lock){+.+.+.}, at: [] ftrace_set_hash+0x81/0x1f0 but task is already holding lock: (ftrace_regex_lock){+.+.+.}, at: [] ftrace_regex_write.isra.29.part.30+0x3d/0x220 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(ftrace_regex_lock); lock(ftrace_regex_lock); *** DEADLOCK *** ---- To fix that, this introduces a finer regex_lock for each ftrace_ops. ftrace_regex_lock is too big of a lock which protects all filter/notrace_hash operations, but it doesn't need to be a global lock after supporting multiple ftrace_ops because each ftrace_ops has its own filter/notrace_hash. Link: http://lkml.kernel.org/r/20130509054417.30398.84254.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu [ Added initialization flag and automate mutex initialization for non ftrace.c ftrace_probes. 
] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 73 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d85a0ad81a67..827f2fe7bc3f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -64,6 +64,13 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_REGEX_LOCK(opsname) \ + .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#else +#define INIT_REGEX_LOCK(opsname) +#endif + static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, @@ -131,6 +138,16 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); while (likely(op = rcu_dereference_raw((op)->next)) && \ unlikely((op) != &ftrace_list_end)) +static inline void ftrace_ops_init(struct ftrace_ops *ops) +{ +#ifdef CONFIG_DYNAMIC_FTRACE + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { + mutex_init(&ops->regex_lock); + ops->flags |= FTRACE_OPS_FL_INITIALIZED; + } +#endif +} + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -907,7 +924,8 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -1103,11 +1121,10 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(global_ops) }; -static DEFINE_MUTEX(ftrace_regex_lock); - struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace 
*records; @@ -1247,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { + ftrace_ops_init(ops); free_ftrace_hash(ops->filter_hash); free_ftrace_hash(ops->notrace_hash); } @@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; int ret = 0; + ftrace_ops_init(ops); + if (unlikely(ftrace_disabled)) return -ENODEV; @@ -2656,7 +2676,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } - mutex_lock(&ftrace_regex_lock); + mutex_lock(&ops->regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -2677,7 +2697,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } else file->private_data = iter; - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ops->regex_lock); return ret; } @@ -2910,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, + .flags = FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(trace_probe_ops) }; static int ftrace_probe_registered; @@ -3256,18 +3278,18 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_regex_lock); - - ret = -ENODEV; - if (unlikely(ftrace_disabled)) - goto out_unlock; - if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; iter = m->private; } else iter = file->private_data; + mutex_lock(&iter->ops->regex_lock); + + ret = -ENODEV; + if (unlikely(ftrace_disabled)) + goto out_unlock; + parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); @@ -3282,7 +3304,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = read; out_unlock: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&iter->ops->regex_lock); return ret; } @@ -3344,7 +3366,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (!hash) return -ENOMEM; - 
mutex_lock(&ftrace_regex_lock); + mutex_lock(&ops->regex_lock); if (reset) ftrace_filter_reset(hash); if (buf && !ftrace_match_records(hash, buf, len)) { @@ -3366,7 +3388,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ops->regex_lock); free_ftrace_hash(hash); return ret; @@ -3392,6 +3414,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) { + ftrace_ops_init(ops); return ftrace_set_addr(ops, ip, remove, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); @@ -3416,6 +3439,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { + ftrace_ops_init(ops); return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3434,6 +3458,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { + ftrace_ops_init(ops); return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); @@ -3524,6 +3549,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; + ftrace_ops_init(ops); + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -3551,14 +3578,14 @@ int ftrace_regex_release(struct inode *inode, struct file *file) int filter_hash; int ret; - mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; - seq_release(inode, file); } else iter = file->private_data; + mutex_lock(&iter->ops->regex_lock); + parser = &iter->parser; if (trace_parser_loaded(parser)) { parser->buffer[parser->idx] = 0; @@ -3587,7 +3614,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) free_ftrace_hash(iter->hash); 
kfree(iter); - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&iter->ops->regex_lock); return 0; } @@ -4126,7 +4153,8 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4180,8 +4208,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, } static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(control_ops) }; static inline void @@ -4539,6 +4568,8 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret = -1; + ftrace_ops_init(ops); + mutex_lock(&ftrace_lock); ret = __register_ftrace_function(ops); -- cgit v1.2.3 From 3f2367ba7cbf13ec0f3f1e93b833a7eacd1ab4b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:21 +0900 Subject: ftrace: Cleanup regex_lock and ftrace_lock around hash updating Cleanup regex_lock and ftrace_lock locking points around ftrace_ops hash update code. The new rule is that regex_lock protects ops->*_hash read-update-write code for each ftrace_ops. Usually, hash update is done by following sequence. 1. allocate a new local hash and copy the original hash. 2. update the local hash. 3. move(actually, copy) back the local hash to ftrace_ops. 4. update ftrace entries if needed. 5. release the local hash. This makes regex_lock protect #1-#4, and ftrace_lock to protect #3, #4 and adding and removing ftrace_ops from the ftrace_ops_list. The ftrace_lock protects #3 as well because the move functions update the entries too. 
Link: http://lkml.kernel.org/r/20130509054421.30398.83411.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 59 ++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 827f2fe7bc3f..cacf0856191e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2656,28 +2656,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, return -ENOMEM; } + iter->ops = ops; + iter->flags = flag; + + mutex_lock(&ops->regex_lock); + if (flag & FTRACE_ITER_NOTRACE) hash = ops->notrace_hash; else hash = ops->filter_hash; - iter->ops = ops; - iter->flags = flag; - if (file->f_mode & FMODE_WRITE) { - mutex_lock(&ftrace_lock); iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); - mutex_unlock(&ftrace_lock); - if (!iter->hash) { trace_parser_put(&iter->parser); kfree(iter); - return -ENOMEM; + ret = -ENOMEM; + goto out_unlock; } } - mutex_lock(&ops->regex_lock); - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_filter_reset(iter->hash); @@ -2697,6 +2695,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } else file->private_data = iter; + + out_unlock: mutex_unlock(&ops->regex_lock); return ret; @@ -3012,7 +3012,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&ftrace_lock); + mutex_lock(&trace_probe_ops.regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { @@ -3070,14 +3070,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); + mutex_lock(&ftrace_lock); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); if (ret < 0) count = ret; 
__enable_ftrace_function_probe(); + mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&ftrace_lock); + mutex_unlock(&trace_probe_ops.regex_lock); free_ftrace_hash(hash); return count; @@ -3117,7 +3119,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&ftrace_lock); + mutex_lock(&trace_probe_ops.regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3155,6 +3157,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, list_add(&entry->free_list, &free_list); } } + mutex_lock(&ftrace_lock); __disable_ftrace_function_probe(); /* * Remove after the disable is called. Otherwise, if the last @@ -3166,9 +3169,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, list_del(&entry->free_list); ftrace_free_entry(entry); } + mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&ftrace_lock); + mutex_unlock(&trace_probe_ops.regex_lock); free_ftrace_hash(hash); } @@ -3284,11 +3288,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - mutex_lock(&iter->ops->regex_lock); - - ret = -ENODEV; if (unlikely(ftrace_disabled)) - goto out_unlock; + return -ENODEV; + + /* iter->hash is a local copy, so we don't need regex_lock */ parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); @@ -3299,13 +3302,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) - goto out_unlock; + goto out; } ret = read; -out_unlock: - mutex_unlock(&iter->ops->regex_lock); - + out: return ret; } @@ -3357,16 +3358,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ops->regex_lock); + if (enable) orig_hash = &ops->filter_hash; else orig_hash = &ops->notrace_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, 
*orig_hash); - if (!hash) - return -ENOMEM; + if (!hash) { + ret = -ENOMEM; + goto out_regex_unlock; + } - mutex_lock(&ops->regex_lock); if (reset) ftrace_filter_reset(hash); if (buf && !ftrace_match_records(hash, buf, len)) { @@ -3584,8 +3588,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) } else iter = file->private_data; - mutex_lock(&iter->ops->regex_lock); - parser = &iter->parser; if (trace_parser_loaded(parser)) { parser->buffer[parser->idx] = 0; @@ -3594,6 +3596,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); + mutex_lock(&iter->ops->regex_lock); + if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); @@ -3611,10 +3615,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_lock); } + + mutex_unlock(&iter->ops->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); - mutex_unlock(&iter->ops->regex_lock); return 0; } -- cgit v1.2.3 From cce2c8f26704529f592fc124c7c6ad399940dc5d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:24 +0900 Subject: tracing/kprobes: Fix to increment return event probe hit-count Fix to increment probe hit-count for function return event. 
Link: http://lkml.kernel.org/r/20130509054424.30398.34058.stgit@mhiramat-M0-7522 Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1865d5f76538..69286337fd7e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -767,6 +767,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; + tp->nhit++; + local_save_flags(irq_flags); pc = preempt_count(); -- cgit v1.2.3 From 30052170dcc256c18a43fb3e76577a67394543f8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:26 +0900 Subject: tracing: Indicate enabled soft-mode in enable file Indicate enabled soft-mode event as "1*" in "enable" file for each event, because it can be soft-disabled when disable_event trigger is hit. 
Link: http://lkml.kernel.org/r/20130509054426.30398.28202.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87e826f1c237..915c136d7bd1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -623,6 +623,8 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (file->flags & FTRACE_EVENT_FL_ENABLED) { if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) buf = "0*\n"; + else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + buf = "1*\n"; else buf = "1\n"; } else -- cgit v1.2.3 From 1cf4c0732db3cd3c49cadbc60ff6bda08604e6fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:29 +0900 Subject: tracing: Modify soft-mode only if there's no other referrer Modify soft-mode flag only if no other soft-mode referrer (currently only the ftrace triggers) by using a reference counter in each ftrace_event_file. Without this fix, adding and removing several different enable/disable_event triggers on the same event clear soft-mode bit from the ftrace_event_file. This also happens with a typo of glob on setting triggers. e.g. # echo vfs_symlink:enable_event:net:netif_rx > set_ftrace_filter # cat events/net/netif_rx/enable 0* # echo typo_func:enable_event:net:netif_rx > set_ftrace_filter # cat events/net/netif_rx/enable 0 # cat set_ftrace_filter #### all functions enabled #### vfs_symlink:enable_event:net:netif_rx:unlimited As above, we still have a trigger, but soft-mode is gone. 
Link: http://lkml.kernel.org/r/20130509054429.30398.7464.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: David Sharp Cc: Hiraku Toyooka Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 915c136d7bd1..8be1224046f8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -251,7 +251,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, switch (enable) { case 0: /* - * When soft_disable is set and enable is cleared, we want + * When soft_disable is set and enable is cleared, the sm_ref + * reference counter is decremented. If it reaches 0, we want * to clear the SOFT_DISABLED flag but leave the event in the * state that it was. That is, if the event was enabled and * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED @@ -263,6 +264,8 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. 
*/ if (soft_disable) { + if (atomic_dec_return(&file->sm_ref) > 0) + break; disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); } else @@ -291,8 +294,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, */ if (!soft_disable) clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); - else + else { + if (atomic_inc_return(&file->sm_ref) > 1) + break; set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { @@ -1540,6 +1546,7 @@ __trace_add_new_event(struct ftrace_event_call *call, file->event_call = call; file->tr = tr; + atomic_set(&file->sm_ref, 0); list_add(&file->list, &tr->events); return event_create_dir(tr->event_dir, file, id, enable, filter, format); @@ -1562,6 +1569,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call, file->event_call = call; file->tr = tr; + atomic_set(&file->sm_ref, 0); list_add(&file->list, &tr->events); return 0; -- cgit v1.2.3 From da511bf33e47ea1f33f4b672f7da166d2a1b8a91 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 9 May 2013 15:00:07 -0400 Subject: tracing: Add helper function trace_create_new_event() to remove duplicate code Both __trace_add_new_event() and __trace_early_add_new_event() do basically the same thing, except that __trace_add_new_event() does a little more. Instead of having duplicate code between the two functions, add a helper function trace_create_new_event() that both can use. This will help against having bugs fixed in one function but not the other. 
Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8be1224046f8..7a0cf68027cc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1529,6 +1529,24 @@ __register_event(struct ftrace_event_call *call, struct module *mod) return 0; } +static struct ftrace_event_file * +trace_create_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return NULL; + + file->event_call = call; + file->tr = tr; + atomic_set(&file->sm_ref, 0); + list_add(&file->list, &tr->events); + + return file; +} + /* Add an event to a trace directory */ static int __trace_add_new_event(struct ftrace_event_call *call, @@ -1540,15 +1558,10 @@ __trace_add_new_event(struct ftrace_event_call *call, { struct ftrace_event_file *file; - file = kmem_cache_alloc(file_cachep, GFP_TRACE); + file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - file->event_call = call; - file->tr = tr; - atomic_set(&file->sm_ref, 0); - list_add(&file->list, &tr->events); - return event_create_dir(tr->event_dir, file, id, enable, filter, format); } @@ -1563,15 +1576,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call, { struct ftrace_event_file *file; - file = kmem_cache_alloc(file_cachep, GFP_TRACE); + file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; - file->event_call = call; - file->tr = tr; - atomic_set(&file->sm_ref, 0); - list_add(&file->list, &tr->events); - return 0; } -- cgit v1.2.3 From 5ae0bf5972b66d35e5674e1b7d855b1e111a68ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 9 May 2013 18:20:37 -0400 Subject: ftrace: Fix locking in register_ftrace_function_probe() The iteration of the ftrace 
function list and the call to ftrace_match_record() need to be protected by the ftrace_lock. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cacf0856191e..f104c45cbcc1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3017,14 +3017,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { count = -ENOMEM; - goto out_unlock; + goto out; } if (unlikely(ftrace_disabled)) { count = -ENODEV; - goto out_unlock; + goto out; } + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { if (!ftrace_match_record(rec, NULL, search, len, type)) @@ -3070,15 +3072,15 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); - mutex_lock(&ftrace_lock); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); if (ret < 0) count = ret; __enable_ftrace_function_probe(); - mutex_unlock(&ftrace_lock); out_unlock: + mutex_unlock(&ftrace_lock); + out: mutex_unlock(&trace_probe_ops.regex_lock); free_ftrace_hash(hash); -- cgit v1.2.3 From 23ea9c4dda129fe1711f9fbda03c7a9c91cf1322 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 9 May 2013 19:31:48 -0400 Subject: ftrace: Fix the output of enabled_functions debug file The enabled_functions debugfs file was created to be able to see what functions have been modified from nops to calling a tracer. The current method uses the counter in the function record. As when a ftrace_ops is registered to a function, its count increases. But that doesn't mean that the function is actively being traced. /proc/sys/kernel/ftrace_enabled can be set to zero which would disable it, as well as something can go wrong and we can think it's enabled when only the counter is set.
The record's FTRACE_FL_ENABLED flag is set or cleared when its function is modified. That is a much more accurate way of knowing what function is enabled or not. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f104c45cbcc1..dcca9fa29bf4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2459,7 +2459,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && - !(rec->flags & ~FTRACE_FL_MASK))) { + !(rec->flags & FTRACE_FL_ENABLED))) { rec = NULL; goto retry; -- cgit v1.2.3 From 19dd603e45199d93d61e9853c596d098e04e5d66 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 9 May 2013 19:37:36 -0400 Subject: ftrace: Fix function probe when more than one probe is added When the first function probe is added and the function tracer is updated the functions are modified to call the probe. But when a second function is added, it updates the function records to have the second function also update, but it fails to update the actual function itself. This prevents the second (or third or fourth and so on) probes from having their functions called.
# echo vfs_symlink:enable_event:sched:sched_switch > set_ftrace_filter # echo vfs_unlink:enable_event:sched:sched_switch > set_ftrace_filter # cat trace # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | # touch /tmp/a # rm /tmp/a # cat trace # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | # ln -s /tmp/a # cat trace # tracer: nop # # entries-in-buffer/entries-written: 414/414 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [000] d..3 2847.923031: sched_switch: prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=2786 next_prio=120 <...>-3114 [001] d..4 2847.923035: sched_switch: prev_comm=ln prev_pid=3114 prev_prio=120 prev_state=x ==> next_comm=swapper/1 next_pid=0 next_prio=120 bash-2786 [000] d..3 2847.923535: sched_switch: prev_comm=bash prev_pid=2786 prev_prio=120 prev_state=S ==> next_comm=kworker/0:1 next_pid=34 next_prio=120 kworker/0:1-34 [000] d..3 2847.923552: sched_switch: prev_comm=kworker/0:1 prev_pid=34 prev_prio=120 prev_state=S ==> next_comm=swapper/0 next_pid=0 next_prio=120 -0 [002] d..3 2847.923554: sched_switch: prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=sshd next_pid=2783 next_prio=120 sshd-2783 [002] d..3 2847.923660: sched_switch: prev_comm=sshd prev_pid=2783 prev_prio=120 prev_state=S ==> next_comm=swapper/2 next_pid=0 next_prio=120 Still need to update the functions even though the probe itself does not need to be registered again when added a new probe. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcca9fa29bf4..b549b0f5b977 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2941,8 +2941,12 @@ static void __enable_ftrace_function_probe(void) int ret; int i; - if (ftrace_probe_registered) + if (ftrace_probe_registered) { + /* still need to update the function call sites */ + if (ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); return; + } for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; -- cgit v1.2.3 From db02038f4e6a776fd3a0bb71242be37ff378ce86 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:32 +0900 Subject: tracing/kprobes: Use bool for retprobe checker Use bool instead of int for kretprobe checker. Link: http://lkml.kernel.org/r/20130509054431.30398.38561.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 69286337fd7e..0b7386a54b1e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -46,7 +46,7 @@ struct trace_probe { (sizeof(struct probe_arg) * (n))) -static __kprobes int trace_probe_is_return(struct trace_probe *tp) +static __kprobes bool trace_probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; } -- cgit v1.2.3 From 48182bd2261766b810e4e4269a23236c1ace63fb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:36 +0900 Subject: tracing/kprobes: Increment probe hit-count even if it is used by perf Increment probe hit-count for profiling even if it is used by perf tool. 
Same thing has already done in trace_uprobe. Link: http://lkml.kernel.org/r/20130509054436.30398.21133.stgit@mhiramat-M0-7522 Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0b7386a54b1e..6e86fbbae337 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -733,8 +733,6 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; - tp->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -767,8 +765,6 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; - tp->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -1075,6 +1071,8 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + tp->nhit++; + if (tp->flags & TP_FLAG_TRACE) kprobe_trace_func(kp, regs); #ifdef CONFIG_PERF_EVENTS @@ -1089,6 +1087,8 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + tp->nhit++; + if (tp->flags & TP_FLAG_TRACE) kretprobe_trace_func(ri, regs); #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From 2b106aabe6c566ba19c352f22683381e1ea41326 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:41 +0900 Subject: tracing/kprobes: Pass trace_probe directly from dispatcher Pass the pointer of struct trace_probe directly from probe dispatcher to handlers. This removes redundant container_of macro uses. Same thing has already done in trace_uprobe. 
Link: http://lkml.kernel.org/r/20130509054441.30398.69112.stgit@mhiramat-M0-7522 Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6e86fbbae337..9ca44fc3fb0b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -723,9 +723,9 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, } /* Kprobe handler */ -static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static __kprobes void +kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -745,7 +745,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)kp->addr; + entry->ip = (unsigned long)tp->rp.kp.addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) @@ -754,10 +754,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) } /* Kretprobe handler */ -static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, - struct pt_regs *regs) +static __kprobes void +kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -973,10 +973,9 @@ static int set_print_fmt(struct trace_probe *tp) #ifdef 
CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void kprobe_perf_func(struct kprobe *kp, - struct pt_regs *regs) +static __kprobes void +kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; @@ -995,7 +994,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->ip = (unsigned long)kp->addr; + entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); @@ -1005,10 +1004,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, } /* Kretprobe profile handler */ -static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, - struct pt_regs *regs) +static __kprobes void +kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; @@ -1074,10 +1073,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) tp->nhit++; if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(kp, regs); + kprobe_trace_func(tp, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(kp, regs); + kprobe_perf_func(tp, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1090,10 +1089,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) tp->nhit++; if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(ri, regs); + kretprobe_trace_func(tp, ri, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(ri, regs); + kretprobe_perf_func(tp, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } -- cgit 
v1.2.3 From 41a7dd420c57323391d58b553318c1fad8e7ebc2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:49 +0900 Subject: tracing/kprobes: Support ftrace_event_file base multibuffer Support multi-buffer on kprobe-based dynamic events by using ftrace_event_file. Link: http://lkml.kernel.org/r/20130509054449.30398.88343.stgit@mhiramat-M0-7522 Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 250 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 214 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9ca44fc3fb0b..fee865d8a7c4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -27,7 +27,6 @@ /** * Kprobe event core functions */ - struct trace_probe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ @@ -36,6 +35,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; + struct ftrace_event_file **files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; @@ -183,12 +183,57 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } -/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static int enable_trace_probe(struct trace_probe *tp, int flag) +static int trace_probe_nr_files(struct trace_probe *tp) +{ + struct ftrace_event_file **file = tp->files; + int ret = 0; + + if (file) + while (*(file++)) + ret++; + + return ret; +} + +static DEFINE_MUTEX(probe_enable_lock); + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
+ */ +static int +enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - tp->flags |= flag; + mutex_lock(&probe_enable_lock); + + if (file) { + struct ftrace_event_file **new, **old = tp->files; + int n = trace_probe_nr_files(tp); + + /* 1 is for new one and 1 is for stopper */ + new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto out_unlock; + } + memcpy(new, old, n * sizeof(struct ftrace_event_file *)); + new[n] = file; + /* The last one keeps a NULL */ + + rcu_assign_pointer(tp->files, new); + tp->flags |= TP_FLAG_TRACE; + + if (old) { + /* Make sure the probe is done with old files */ + synchronize_sched(); + kfree(old); + } + } else + tp->flags |= TP_FLAG_PROFILE; + if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) @@ -197,19 +242,83 @@ static int enable_trace_probe(struct trace_probe *tp, int flag) ret = enable_kprobe(&tp->rp.kp); } + out_unlock: + mutex_unlock(&probe_enable_lock); + return ret; } -/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static void disable_trace_probe(struct trace_probe *tp, int flag) +static int +trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +{ + int i; + + if (tp->files) { + for (i = 0; tp->files[i]; i++) + if (tp->files[i] == file) + return i; + } + + return -1; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. 
+ */ +static int +disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { - tp->flags &= ~flag; + int ret = 0; + + mutex_lock(&probe_enable_lock); + + if (file) { + struct ftrace_event_file **new, **old = tp->files; + int n = trace_probe_nr_files(tp); + int i, j; + + if (n == 0 || trace_probe_file_index(tp, file) < 0) { + ret = -EINVAL; + goto out_unlock; + } + + if (n == 1) { /* Remove the last file */ + tp->flags &= ~TP_FLAG_TRACE; + new = NULL; + } else { + new = kzalloc(n * sizeof(struct ftrace_event_file *), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto out_unlock; + } + + /* This copy & check loop copies the NULL stopper too */ + for (i = 0, j = 0; j < n && i < n + 1; i++) + if (old[i] != file) + new[j++] = old[i]; + } + + rcu_assign_pointer(tp->files, new); + + /* Make sure the probe is done with old files */ + synchronize_sched(); + kfree(old); + } else + tp->flags &= ~TP_FLAG_PROFILE; + if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_probe_is_return(tp)) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); } + + out_unlock: + mutex_unlock(&probe_enable_lock); + + return ret; } /* Internal register function - just handle k*probes and flags */ @@ -724,7 +833,8 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, /* Kprobe handler */ static __kprobes void -kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) { struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -733,14 +843,17 @@ kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; + WARN_ON(call != ftrace_file->event_call); + local_save_flags(irq_flags); pc = preempt_count(); dsize = __get_data_size(tp, regs); size = sizeof(*entry) + tp->size + dsize; - event = 
trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); if (!event) return; @@ -753,10 +866,23 @@ kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) irq_flags, pc, regs); } +static __kprobes void +kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +{ + struct ftrace_event_file **file = tp->files; + + /* Note: preempt is already disabled around the kprobe handler */ + while (*file) { + __kprobe_trace_func(tp, regs, *file); + file++; + } +} + /* Kretprobe handler */ static __kprobes void -kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, - struct pt_regs *regs) +__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) { struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -765,14 +891,17 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; + WARN_ON(call != ftrace_file->event_call); + local_save_flags(irq_flags); pc = preempt_count(); dsize = __get_data_size(tp, regs); size = sizeof(*entry) + tp->size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); if (!event) return; @@ -786,6 +915,19 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, irq_flags, pc, regs); } +static __kprobes void +kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct ftrace_event_file **file = tp->files; + + /* Note: preempt is already disabled around the kprobe handler */ + while (*file) { + __kretprobe_trace_func(tp, ri, regs, *file); + file++; + } +} + /* Event 
entry printers */ enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, @@ -1041,20 +1183,19 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_probe(tp, TP_FLAG_TRACE); + return enable_trace_probe(tp, file); case TRACE_REG_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_TRACE); - return 0; + return disable_trace_probe(tp, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, TP_FLAG_PROFILE); + return enable_trace_probe(tp, NULL); case TRACE_REG_PERF_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_PROFILE); - return 0; + return disable_trace_probe(tp, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1190,11 +1331,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, return a1 + a2 + a3 + a4 + a5 + a6; } +static struct ftrace_event_file * +find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) + if (file->event_call == &tp->call) + return file; + + return NULL; +} + static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; int (*target)(int, int, int, int, int, int); struct trace_probe *tp; + struct ftrace_event_file *file; target = kprobe_trace_selftest_target; @@ -1204,31 +1358,43 @@ static __init int kprobe_trace_self_tests_init(void) "$stack $stack0 +0($stack)", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function entry.\n"); + pr_warn("error on probing function entry.\n"); warn++; } else { /* Enable trace point */ tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); + pr_warn("error on getting new probe.\n"); 
warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_probe(tp, file); + } } ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " "$retval", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function return.\n"); + pr_warn("error on probing function return.\n"); warn++; } else { /* Enable trace point */ tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); + pr_warn("error on getting 2nd new probe.\n"); warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_probe(tp, file); + } } if (warn) @@ -1239,27 +1405,39 @@ static __init int kprobe_trace_self_tests_init(void) /* Disable trace points before removing it */ tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting test probe.\n"); + pr_warn("error on getting test probe.\n"); warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_probe(tp, file); + } tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting 2nd test probe.\n"); + pr_warn("error on getting 2nd test probe.\n"); warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tp, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_probe(tp, 
file); + } ret = traceprobe_command("-:testprobe", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); + pr_warn("error on deleting a probe.\n"); warn++; } ret = traceprobe_command("-:testprobe2", create_trace_probe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); + pr_warn("error on deleting a probe.\n"); warn++; } -- cgit v1.2.3 From b8820084f2130b3dcfb09c78ac16cdd2194a345b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 May 2013 14:44:54 +0900 Subject: tracing/kprobes: Support soft-mode disabling Support soft-mode disabling on kprobe-based dynamic events. Soft-disabling is just ignoring recording if the soft disabled flag is set. Link: http://lkml.kernel.org/r/20130509054454.30398.7237.stgit@mhiramat-M0-7522 Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index fee865d8a7c4..636d45fe69b3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -845,6 +845,9 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, WARN_ON(call != ftrace_file->event_call); + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + return; + local_save_flags(irq_flags); pc = preempt_count(); @@ -893,6 +896,9 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, WARN_ON(call != ftrace_file->event_call); + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + return; + local_save_flags(irq_flags); pc = preempt_count(); -- cgit v1.2.3 From d3251859168b0b12841e1b90d6d768ab478dc23d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 May 2013 11:10:17 -0700 Subject: workqueue: workqueue_congested() shouldn't translate WORK_CPU_UNBOUND into node 
number df2d5ae499 ("workqueue: map an unbound workqueues to multiple per-node pool_workqueues") made unbound workqueues to map to multiple per-node pool_workqueues and accordingly updated workqueue_contested() so that, for unbound workqueues, it maps the specified @cpu to the NUMA node number to obtain the matching pool_workqueue to query the congested state. Before this change, workqueue_congested() ignored @cpu for unbound workqueues as there was only one pool_workqueue and some users (fscache) called it with WORK_CPU_UNBOUND. After the commit, this causes the following oops as WORK_CPU_UNBOUND gets translated to garbage by cpu_to_node(). BUG: unable to handle kernel paging request at ffff8803598d98b8 IP: [] unbound_pwq_by_node+0xa1/0xfa PGD 2421067 PUD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 2689 Comm: cat Tainted: GF 3.9.0-fsdevel+ #4 task: ffff88003d801040 ti: ffff880025806000 task.ti: ffff880025806000 RIP: 0010:[] [] unbound_pwq_by_node+0xa1/0xfa RSP: 0018:ffff880025807ad8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff8800388a2400 RCX: 0000000000000003 RDX: ffff880025807fd8 RSI: ffffffff81a31420 RDI: ffff88003d8016e0 RBP: ffff880025807ae8 R08: ffff88003d801730 R09: ffffffffa00b4898 R10: ffffffff81044217 R11: ffff88003d801040 R12: 0000000064206e97 R13: ffff880036059d98 R14: ffff880038cc8080 R15: ffff880038cc82d0 FS: 00007f21afd9c740(0000) GS:ffff88003d100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff8803598d98b8 CR3: 000000003df49000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff8800388a2400 0000000000000002 ffff880025807b18 ffffffff810442ce ffffffff81044217 ffff880000000002 ffff8800371b4080 ffff88003d112ec0 ffff880025807b38 ffffffffa00810b0 ffff880036059d88 ffff880036059be8 Call Trace: [] workqueue_congested+0xb7/0x12c [] fscache_enqueue_object+0xb2/0xe8 [fscache] [] 
__fscache_acquire_cookie+0x3b9/0x56c [fscache] [] nfs_fscache_set_inode_cookie+0xee/0x132 [nfs] [] do_open+0x9/0xd [nfs] [] do_dentry_open+0x175/0x24b [] finish_open+0x41/0x51 Fix it by using smp_processor_id() if @cpu is WORK_CPU_UNBOUND. Signed-off-by: Tejun Heo Reported-by: David Howells Tested-and-Acked-by: David Howells --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..1ae602809efb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4311,6 +4311,12 @@ bool current_is_workqueue_rescuer(void) * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * + * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. + * Note that both per-cpu and unbound workqueues may be associated with + * multiple pool_workqueues which have separate congested states. A + * workqueue being congested on one CPU doesn't mean the workqueue is also + * contested on other CPUs / NUMA nodes. + * * RETURNS: * %true if congested, %false otherwise. */ @@ -4321,6 +4327,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) rcu_read_lock_sched(); + if (cpu == WORK_CPU_UNBOUND) + cpu = smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else -- cgit v1.2.3 From 4b0c0f294f60abcdd20994a8341a95c8ac5eeb96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 15:02:50 +0200 Subject: tick: Cleanup NOHZ per cpu data on cpu down Prarit reported a crash on CPU offline/online. The reason is that on CPU down the NOHZ related per cpu data of the dead cpu is not cleaned up. If at cpu online an interrupt happens before the per cpu tick device is registered the irq_enter() check potentially sees stale data and dereferences a NULL pointer. Cleanup the data after the cpu is dead. 
Reported-by: Prarit Bhargava Cc: stable@vger.kernel.org Cc: Mike Galbraith Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305031451561.2886@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 225f8bf19095..0eed1db2d792 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -904,7 +904,7 @@ void tick_cancel_sched_timer(int cpu) hrtimer_cancel(&ts->sched_timer); # endif - ts->nohz_mode = NOHZ_MODE_INACTIVE; + memset(ts, 0, sizeof(*ts)); } #endif -- cgit v1.2.3 From d6cbf35dac8a3dadb9103379820c96d7c85df3d9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 May 2013 19:44:20 +0800 Subject: cgroup: initialize xattr before calling d_instantiate() cgroup_create_file() calls d_instantiate(), which may decide to look at the xattrs on the file. Smack always does this and SELinux can be configured to do so. But cgroup_add_file() didn't initialize xattrs before calling cgroup_create_file(), which finally leads to dereferencing NULL dentry->d_fsdata. This bug has been there since cgroup xattr was introduced. 
Cc: # 3.8.x Reported-by: Ivan Bulatovic Reported-by: Casey Schaufler Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f80..38b136553044 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2699,13 +2699,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, goto out; } + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } -- cgit v1.2.3 From f7ea0fd639c2c48d3c61b6eec75362be290c6874 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 May 2013 21:40:27 +0200 Subject: tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline commit 5b39939a4 (nohz: Move ts->idle_calls incrementation into strict idle logic) moved code out of tick_nohz_stop_sched_tick() and missed to bail out when the cpu is offline. That's causing subsequent failures as an offline CPU is supposed to die and not to fiddle with nohz magic. Return false in can_stop_idle_tick() if the cpu is offline. 
Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Prarit Bhargava Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Tony Luck Cc: x86@kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305132138160.2863@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0eed1db2d792..012142187db9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -469,6 +469,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) -- cgit v1.2.3 From b47430d3adbedbfdb5979ba4874f5dadf94f16b1 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 14 May 2013 04:01:27 +0530 Subject: rcu/idle: Wrap cpu-idle poll mode within rcu_idle_enter/exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bjørn Mork reported the following warning when running powertop. [ 49.289034] ------------[ cut here ]------------ [ 49.289055] WARNING: at kernel/rcutree.c:502 rcu_eqs_exit_common.isra.48+0x3d/0x125() [ 49.289244] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.10.0-bisect-rcu-warn+ #107 [ 49.289251] ffffffff8157d8c8 ffffffff81801e28 ffffffff8137e4e3 ffffffff81801e68 [ 49.289260] ffffffff8103094f ffffffff81801e68 0000000000000000 ffff88023afcd9b0 [ 49.289268] 0000000000000000 0140000000000000 ffff88023bee7700 ffffffff81801e78 [ 49.289276] Call Trace: [ 49.289285] [] dump_stack+0x19/0x1b [ 49.289293] [] warn_slowpath_common+0x62/0x7b [ 49.289300] [] warn_slowpath_null+0x15/0x17 [ 49.289306] [] rcu_eqs_exit_common.isra.48+0x3d/0x125 [ 49.289314] [] ? 
trace_hardirqs_off_caller+0x37/0xa6 [ 49.289320] [] rcu_idle_exit+0x85/0xa8 [ 49.289327] [] trace_cpu_idle_rcuidle+0xae/0xff [ 49.289334] [] cpu_startup_entry+0x72/0x115 [ 49.289341] [] rest_init+0x149/0x150 [ 49.289347] [] ? csum_partial_copy_generic+0x16c/0x16c [ 49.289355] [] start_kernel+0x3f0/0x3fd [ 49.289362] [] ? repair_env_string+0x5a/0x5a [ 49.289368] [] x86_64_start_reservations+0x2a/0x2c [ 49.289375] [] x86_64_start_kernel+0xcd/0xd1 [ 49.289379] ---[ end trace 07a1cc95e29e9036 ]--- The warning is that 'rdtp->dynticks' has an unexpected value, which roughly translates to - the calls to rcu_idle_enter() and rcu_idle_exit() were not made in the correct order, or otherwise messed up. And Bjørn's painstaking debugging indicated that this happens when the idle loop enters the poll mode. Looking at the poll function cpu_idle_poll(), and the implementation of trace_cpu_idle_rcuidle(), the problem becomes very clear: cpu_idle_poll() lacks calls to rcu_idle_enter/exit(), and trace_cpu_idle_rcuidle() calls them in the reverse order - first rcu_idle_exit(), and then rcu_idle_enter(). Hence the even/odd alternative sequencing of rdtp->dynticks goes for a toss. And powertop readily triggers this because powertop uses the idle-tracing infrastructure extensively. So, to fix this, wrap the code in cpu_idle_poll() within rcu_idle_enter/exit(), so that it blends properly with the calls inside trace_cpu_idle_rcuidle() and thus get the function ordering right. Reported-and-tested-by: Bjørn Mork Cc: Paul McKenney Cc: Steven Rostedt Cc: Dipankar Sarma Signed-off-by: Srivatsa S. 
Bhat Link: http://lkml.kernel.org/r/519169BF.4080208@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 8b86c0c68edf..d5585f5e038e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -40,11 +40,13 @@ __setup("hlt", cpu_idle_nopoll_setup); static inline int cpu_idle_poll(void) { + rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); return 1; } -- cgit v1.2.3 From 42a5cf46cd56f46267d2a9fcf2655f4078cd3042 Mon Sep 17 00:00:00 2001 From: Tirupathi Reddy Date: Tue, 14 May 2013 13:59:02 +0530 Subject: timer: Don't reinitialize the cpu base lock during CPU_UP_PREPARE An inactive timer's base can refer to a offline cpu's base. In the current code, cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is trying to modify an inactive timer on that CPU with holding its timer base lock, then the lock will be reinitialized under its feet. This leads to following SPIN_BUG(). 
<0> BUG: spinlock already unlocked on CPU#3, kworker/u:3/1466 <0> lock: 0xe3ebe000, .magic: dead4ead, .owner: kworker/u:3/1466, .owner_cpu: 1 <4> [] (unwind_backtrace+0x0/0x11c) from [] (do_raw_spin_unlock+0x40/0xcc) <4> [] (do_raw_spin_unlock+0x40/0xcc) from [] (_raw_spin_unlock+0x8/0x30) <4> [] (_raw_spin_unlock+0x8/0x30) from [] (mod_timer+0x294/0x310) <4> [] (mod_timer+0x294/0x310) from [] (queue_delayed_work_on+0x104/0x120) <4> [] (queue_delayed_work_on+0x104/0x120) from [] (sdhci_msm_bus_voting+0x88/0x9c) <4> [] (sdhci_msm_bus_voting+0x88/0x9c) from [] (sdhci_disable+0x40/0x48) <4> [] (sdhci_disable+0x40/0x48) from [] (mmc_release_host+0x4c/0xb0) <4> [] (mmc_release_host+0x4c/0xb0) from [] (mmc_sd_detect+0x90/0xfc) <4> [] (mmc_sd_detect+0x90/0xfc) from [] (mmc_rescan+0x7c/0x2c4) <4> [] (mmc_rescan+0x7c/0x2c4) from [] (process_one_work+0x27c/0x484) <4> [] (process_one_work+0x27c/0x484) from [] (worker_thread+0x210/0x3b0) <4> [] (worker_thread+0x210/0x3b0) from [] (kthread+0x80/0x8c) <4> [] (kthread+0x80/0x8c) from [] (kernel_thread_exit+0x0/0x8) As an example, this particular crash occurred when CPU #3 is executing mod_timer() on an inactive timer whose base is refered to offlined CPU #2. The code locked the timer_base corresponding to CPU #2. Before it could proceed, CPU #2 came online and reinitialized the spinlock corresponding to its base. Thus now CPU #3 held a lock which was reinitialized. When CPU #3 finally ended up unlocking the old cpu_base corresponding to CPU #2, we hit the above SPIN_BUG(). CPU #0 CPU #3 CPU #2 ------ ------- ------- ..... ...... mod_timer() lock_timer_base spin_lock_irqsave(&base->lock) cpu_up(2) ..... ...... init_timers_cpu() .... ..... spin_lock_init(&base->lock) ..... spin_unlock_irqrestore(&base->lock) ...... Allocation of per_cpu timer vector bases is done only once under "tvec_base_done[]" check. In the current code, spinlock_initialization of base->lock isn't under this check. 
When a CPU is up each time the base lock is reinitialized. Move base spinlock initialization under the check. Signed-off-by: Tirupathi Reddy Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1368520142-4136-1-git-send-email-tirupath@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 09bca8ce9771..7376589adc28 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1539,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu) boot_done = 1; base = &boot_tvec_bases; } + spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; } else { base = per_cpu(tvec_bases, cpu); } - spin_lock_init(&base->lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v1.2.3 From 6faf72834d9d0c0dc6632604eaeffb621e87fcf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 May 2013 06:53:37 -0700 Subject: rcu: Fix comparison sense in rcu_needs_cpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c0f4dfd4f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks) introduced a bug that can result in excessively long grace periods. This bug reverses the sense of the "if" statement checking for lazy callbacks, so that RCU takes a lazy approach when there are in fact non-lazy callbacks. This can result in excessive boot, suspend, and resume times. This commit therefore fixes the sense of this "if" statement. Reported-by: Borislav Petkov Reported-by: Bjørn Mork Reported-by: Joerg Roedel Signed-off-by: Paul E.
McKenney Tested-by: Bjørn Mork Tested-by: Joerg Roedel --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 170814dc418f..6d939a645da1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) rdtp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (rdtp->all_lazy) { + if (!rdtp->all_lazy) { *dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { -- cgit v1.2.3 From 8f174b1175a10903ade40f36eb6c896412877ca0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 1 May 2013 00:07:00 +0900 Subject: workqueue: correct handling of the pool spin_lock When we fail to mutex_trylock(), we release the pool spin_lock and do mutex_lock(). After that, we should regrab the pool spin_lock, but, regrabbing is missed in current code. So correct it. Cc: Lai Jiangshan Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ae602809efb..286847b90225 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker) if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); + spin_lock_irq(&pool->lock); ret = true; } -- cgit v1.2.3 From ad7b1f841f8a54c6d61ff181451f55b68175e15a Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 6 May 2013 17:44:55 -0400 Subject: workqueue: Make schedule_work() available again to non GPL modules Commit 8425e3d5bdbe ("workqueue: inline trivial wrappers") changed schedule_work() and schedule_delayed_work() to inline wrappers, but these rely on some symbols that are EXPORT_SYMBOL_GPL, while the original functions were EXPORT_SYMBOL. 
This has the effect of changing the licensing requirement for these functions and making them unavailable to non GPL modules. Make them available again by removing the restriction on the required symbols. Signed-off-by: Marc Dionne Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 286847b90225..02916f421385 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; @@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL(queue_work_on); void delayed_work_timer_fn(unsigned long __data) { @@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU -- cgit v1.2.3 From b4f711ee03d28f776fd2324fd0bd999cc428e4d2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 24 Apr 2013 11:32:56 -0700 Subject: time: Revert ALWAYS_USE_PERSISTENT_CLOCK compile time optimizaitons Kay Sievers noted that the ALWAYS_USE_PERSISTENT_CLOCK config, which enables some minor compile time optimization to avoid unnecessary code in mostly the suspend/resume path could cause problems for userland.
In particular, the dependency for RTC_HCTOSYS on !ALWAYS_USE_PERSISTENT_CLOCK, which avoids setting the time twice and simplifies suspend/resume, has the side effect of causing the /sys/class/rtc/rtcN/hctosys flag to always be zero, and this flag is commonly used by udev to setup the /dev/rtc symlink to /dev/rtcN, which can cause pain for older applications. While the udev rules could use some work to be less fragile, breaking userland should strongly be avoided. Additionally the compile time optimizations are fairly minor, and the code being optimized is likely to be reworked in the future, so lets revert this change. Reported-by: Kay Sievers Signed-off-by: John Stultz Cc: stable #3.9 Cc: Feng Tang Cc: Jason Gunthorpe Link: http://lkml.kernel.org/r/1366828376-18124-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..b69692250af4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool -# Platforms has a persistent clock -config ALWAYS_USE_PERSISTENT_CLOCK - bool - default n - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool -- cgit v1.2.3 From 615ee5443ff9bedd356dc6865f3e9c276ce434ea Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 26 Mar 2013 11:35:16 -0400 Subject: rcu: Don't allocate bootmem from rcu_init() When rcu_init() is called we already have slab working, allocating bootmem at that point results in warnings and an allocation from slab. This commit therefore changes alloc_bootmem_cpumask_var() to alloc_cpumask_var() in rcu_bootup_announce_oddness(), which is called from rcu_init(). Signed-off-by: Sasha Levin Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Tested-by: Robin Holt [paulmck: convert to zalloc_cpumask_var(), as suggested by Yinghai Lu.] 
--- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6d939a645da1..3db5a375d8dd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { - alloc_bootmem_cpumask_var(&rcu_nocb_mask); + zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO -- cgit v1.2.3 From 60705c89460fdc7227f2d153b68b3f34814738a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 May 2013 15:40:48 -0400 Subject: tracing: Fix leaks of filter preds Special preds are created when folding a series of preds that can be done in serial. These are allocated in an ops field of the pred structure. But they were never freed, causing memory leaks. This was discovered using the kmemleak checker: unreferenced object 0xffff8800797fd5e0 (size 32): comm "swapper/0", pid 1, jiffies 4294690605 (age 104.608s) hex dump (first 32 bytes): 00 00 01 00 03 00 05 00 07 00 09 00 0b 00 0d 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace: [] kmemleak_alloc+0x73/0x98 [] kmemleak_alloc_recursive.constprop.42+0x16/0x18 [] __kmalloc+0xd7/0x125 [] kcalloc.constprop.24+0x2d/0x2f [] fold_pred_tree_cb+0xa9/0xf4 [] walk_pred_tree+0x47/0xcc [] replace_preds.isra.20+0x6f8/0x72f [] create_filter+0x4e/0x8b [] ftrace_test_event_filter+0x5a/0x155 [] do_one_initcall+0xa0/0x137 [] kernel_init_freeable+0x14d/0x1dc [] kernel_init+0xe/0xdb [] ret_from_fork+0x7c/0xb0 [] 0xffffffffffffffff Cc: Tom Zanussi Cc: stable@vger.kernel.org # 2.6.39+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a6361178de5a..e1b653f7e1ca 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -750,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { + int i; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + kfree(filter->preds[i].ops); kfree(filter->preds); filter->preds = NULL; } -- cgit v1.2.3 From c02c7e65d9b13670e34bc523744cf4f6e99c198a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:34 +0900 Subject: tracing/kprobes: Use rcu_dereference_raw for tp->files Use rcu_dereference_raw() for accessing tp->files. Because the write-side uses rcu_assign_pointer() for memory barrier, the read-side also has to use rcu_dereference_raw() with read memory barrier. 
Link: http://lkml.kernel.org/r/20130513115834.6545.17022.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 47 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 636d45fe69b3..0a3d8d5c483d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -185,9 +185,14 @@ static struct trace_probe *find_trace_probe(const char *event, static int trace_probe_nr_files(struct trace_probe *tp) { - struct ftrace_event_file **file = tp->files; + struct ftrace_event_file **file; int ret = 0; + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + file = rcu_dereference_raw(tp->files); if (file) while (*(file++)) ret++; @@ -209,9 +214,10 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); + old = rcu_dereference_raw(tp->files); /* 1 is for new one and 1 is for stopper */ new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), GFP_KERNEL); @@ -251,11 +257,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) static int trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) { + struct ftrace_event_file **files; int i; - if (tp->files) { - for (i = 0; tp->files[i]; i++) - if (tp->files[i] == file) + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. 
+ */ + files = rcu_dereference_raw(tp->files); + if (files) { + for (i = 0; files[i]; i++) + if (files[i] == file) return i; } @@ -274,10 +286,11 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); int i, j; + old = rcu_dereference_raw(tp->files); if (n == 0 || trace_probe_file_index(tp, file) < 0) { ret = -EINVAL; goto out_unlock; @@ -872,9 +885,16 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. + */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kprobe_trace_func(tp, regs, *file); file++; @@ -925,9 +945,16 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. 
+ */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kretprobe_trace_func(tp, ri, regs, *file); file++; -- cgit v1.2.3 From 3d1fc7b0880c4db612a3d3211a808659e28af588 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:37 +0900 Subject: tracing/kprobes: Fix a sparse warning for incorrect type in assignment Fix a sparse warning about the rcu operated pointer is defined without __rcu address space. Link: http://lkml.kernel.org/r/20130513115837.6545.23322.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0a3d8d5c483d..81c5109b9d00 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file **files; + struct ftrace_event_file * __rcu *files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; -- cgit v1.2.3 From b62fdd97fcae17e483b005bafd13fadbd9840672 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:39 +0900 Subject: tracing/kprobes: Make print_*probe_event static According to sparse warning, print_*probe_event static because those functions are not directly called from outside. 
Link: http://lkml.kernel.org/r/20130513115839.6545.83067.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 81c5109b9d00..9f46e98ba8f2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -962,7 +962,7 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, } /* Event entry printers */ -enum print_line_t +static enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -998,7 +998,7 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -enum print_line_t +static enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { -- cgit v1.2.3 From 1be0c25da56e860992af972a60321563ca2cfcd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 May 2013 14:24:24 -0700 Subject: workqueue: don't perform NUMA-aware allocations on offline nodes in wq_numa_init() wq_numa_init() builds per-node cpumasks which are later used to make unbound workqueues NUMA-aware. The cpumasks are allocated using alloc_cpumask_var_node() for all possible nodes. Unfortunately, on machines with off-line nodes, this leads to NUMA-aware allocations on existing but offline nodes, which in turn triggers BUG in the memory allocation code. Fix it by using NUMA_NO_NODE for cpumask allocations for offline nodes. kernel BUG at include/linux/gfp.h:323!
invalid opcode: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0+ #1 Hardware name: ProLiant BL465c G7, BIOS A19 12/10/2011 task: ffff880234608000 ti: ffff880234602000 task.ti: ffff880234602000 RIP: 0010:[] [] new_slab+0x2ad/0x340 RSP: 0000:ffff880234603bf8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880237404b40 RCX: 00000000000000d0 RDX: 0000000000000001 RSI: 0000000000000003 RDI: 00000000002052d0 RBP: ffff880234603c28 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000001 R11: ffffffff812e3aa8 R12: 0000000000000001 R13: ffff8802378161c0 R14: 0000000000030027 R15: 00000000000040d0 FS: 0000000000000000(0000) GS:ffff880237800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff88043fdff000 CR3: 00000000018d5000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff880234603c28 0000000000000001 00000000000000d0 ffff8802378161c0 ffff880237404b40 ffff880237404b40 ffff880234603d28 ffffffff815edba1 ffff880237816140 0000000000000000 ffff88023740e1c0 Call Trace: [] __slab_alloc+0x330/0x4f2 [] kmem_cache_alloc_node_trace+0xa5/0x200 [] alloc_cpumask_var_node+0x28/0x90 [] wq_numa_init+0x10d/0x1be [] init_workqueues+0x64/0x341 [] do_one_initcall+0xea/0x1a0 [] kernel_init_freeable+0xb7/0x1ec [] kernel_init+0xe/0xf0 [] ret_from_fork+0x7c/0xb0 Code: 45 84 ac 00 00 00 f0 41 80 4d 00 40 e9 f6 fe ff ff 66 0f 1f 84 00 00 00 00 00 e8 eb 4b ff ff 49 89 c5 e9 05 fe ff ff <0f> 0b 4c 8b 73 38 44 89 ff 81 cf 00 00 20 00 4c 89 f6 48 c1 ee Signed-off-by: Tejun Heo Reported-and-Tested-by: Lingzhu Xiang --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02916f421385..ee8e29a2320c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4905,7 +4905,8 @@ static void __init wq_numa_init(void) 
BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); -- cgit v1.2.3 From 6ed0106667d76589cb648c27edb4f4ffbf9d59ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 May 2013 20:48:49 +0900 Subject: tracing: Return -EBUSY when event_enable_func() fails to get module Since try_module_get() returns false( = 0) when it fails to pindown a module, event_enable_func() returns 0 which means "succeed". This can cause a kernel panic when the entry is removed, because the event is already released. This fixes the bug by returning -EBUSY, because the reason why it fails is that the module is being removed at that time. Link: http://lkml.kernel.org/r/20130516114848.13508.97899.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7a0cf68027cc..27963e2bf4bf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ ret = try_module_get(file->event_call->mod); - if (!ret) + if (!ret) { + ret = -EBUSY; goto out_free; + } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) -- cgit v1.2.3 From 264b83c07a84223f0efd0d1db9ccc66d6f88288f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 16 May 2013 17:43:55 +0200 Subject: usermodehelper: check subprocess_info->path != NULL argv_split(empty_or_all_spaces) happily succeeds, it simply returns argc == 0 and argv[0] == NULL. 
Change call_usermodehelper_exec() to check sub_info->path != NULL to avoid the crash. This is the minimal fix, todo: - perhaps we should change argv_split() to return NULL or change the callers. - kill or justify ->path[0] check - narrow the scope of helper_lock() Signed-off-by: Oleg Nesterov Acked-By: Lucas De Marchi Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 1296e72e4161..8241906c4b61 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) int retval = 0; helper_lock(); + if (!sub_info->path) { + retval = -EINVAL; + goto out; + } + if (sub_info->path[0] == '\0') goto out; -- cgit v1.2.3 From 06c9494c0e9bdfcaa14d6d2b096a0ff7abe8494f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:33:01 +0100 Subject: kmemleak: Scan all allocated, writeable and not executable module sections Instead of just picking data sections by name (names that start with .data, .bss or .ref.data), use the section flags and scan all sections that are allocated, writable and not executable. Which should cover all sections of a module that might reference data. 
Signed-off-by: Steven Rostedt [catalin.marinas@arm.com: removed unused 'name' variable] [catalin.marinas@arm.com: collapsed 'if' blocks] Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b049939177f6..06f496a5d182 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod, kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < info->hdr->e_shnum; i++) { - const char *name = info->secstrings + info->sechdrs[i].sh_name; - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!strstarts(name, ".data") && !strstarts(name, ".bss")) + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) continue; kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, -- cgit v1.2.3 From 89c837351db0b9b52fd572ec8b0445a42e59b75c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:46:23 +0100 Subject: kmemleak: No need for scanning specific module sections As kmemleak now scans all module sections that are allocated, writable and non executable, there's no need to scan individual sections that might reference data. 
Signed-off-by: Steven Rostedt Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 06f496a5d182..cab4bce49c23 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_bprintk_fmt_start, - sizeof(*mod->trace_bprintk_fmt_start) * - mod->num_trace_bprintk_fmt, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.3 From fbe06b7bae7c9cf6ab05168fce5ee93b2f4bae7c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 17 May 2013 11:49:10 -0700 Subject: x86, range: fix missing merge during add range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Christian found v3.9 does not work with E350 with EFI is enabled. [ 1.658832] Trying to unpack rootfs image as initramfs... [ 1.679935] BUG: unable to handle kernel paging request at ffff88006e3fd000 [ 1.686940] IP: [] memset+0x1f/0xb0 [ 1.692010] PGD 1f77067 PUD 1f7a067 PMD 61420067 PTE 0 but early memtest report all memory could be accessed without problem. 
The early page table is set up in the following sequence: [ 0.000000] init_memory_mapping: [mem 0x00000000-0x000fffff] [ 0.000000] init_memory_mapping: [mem 0x6e600000-0x6e7fffff] [ 0.000000] init_memory_mapping: [mem 0x6c000000-0x6e5fffff] [ 0.000000] init_memory_mapping: [mem 0x00100000-0x6bffffff] [ 0.000000] init_memory_mapping: [mem 0x6e800000-0x6ea07fff] but later efi_enter_virtual_mode wrongly tries to set up the mapping again. [ 0.010644] pid_max: default: 32768 minimum: 301 [ 0.015302] init_memory_mapping: [mem 0x640c5000-0x6e3fcfff] That means it fails with pfn_range_is_mapped. It turns out that we have a bug in add_range_with_merge: it does not merge ranges properly when the newly added range fills the hole between two existing ranges. In this case, [mem 0x00100000-0x6bffffff] is the hole between [mem 0x00000000-0x000fffff] and [mem 0x6c000000-0x6e7fffff]. Fix add_range_with_merge by calling itself recursively. Reported-by: "Christian König" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVofGoSk7q5-0irjkBxemqK729cND4hov-1QCBJDhxpgQ@mail.gmail.com Cc: v3.9 Signed-off-by: H. 
Peter Anvin --- kernel/range.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 071b0ab455cb..eb911dbce267 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range, final_start = min(range[i].start, start); final_end = max(range[i].end, end); - range[i].start = final_start; - range[i].end = final_end; - return nr_range; + /* clear it and add it back for further merge */ + range[i].start = 0; + range[i].end = 0; + return add_range_with_merge(range, az, nr_range, + final_start, final_end); } /* Need to add it: */ -- cgit v1.2.3 From ca1643186d3dce6171d8f171e516b02496360a9e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 May 2013 11:51:10 -0400 Subject: tracing: Fix crash when ftrace=nop on the kernel command line If ftrace= is on the kernel command line, when that tracer is registered, it will be initiated by tracing_set_tracer() to execute that tracer. The nop tracer is just a stub tracer that is used to have no tracer enabled. It is assigned at early bootup as it is the default tracer. But if ftrace=nop is on the kernel command line, the registering of the nop tracer will call tracing_set_tracer() which will try to execute the nop tracer. But it expects tr->current_trace to be assigned something as it usually is assigned to the nop tracer. As it hasn't been assigned to anything yet, it causes the system to crash. The simple fix is to move the tr->current_trace = nop before registering the nop tracer. The functionality is still the same as the nop tracer doesn't do anything anyway. 
Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ae6fa2d1cdf7..4d79485b3237 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6216,10 +6216,15 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); - register_tracer(&nop_trace); - + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. + */ global_trace.current_trace = &nop_trace; + register_tracer(&nop_trace); + /* All seems OK, enable tracing */ tracing_disabled = 0; -- cgit v1.2.3 From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:50:24 +0900 Subject: cgroup: fix a subtle bug in descendant pre-order walk When cgroup_next_descendant_pre() initiates a walk, it checks whether the subtree root has any children and, if not, returns NULL. Later code assumes that the subtree isn't empty. This is broken because the subtree may become empty in between, which can lead to the traversal escaping the subtree by walking to the sibling of the subtree root. There's no reason to have the early exit path. Remove it along with the later assumption that the subtree isn't empty. This simplifies the code a bit and fixes the subtle bug. While at it, fix the comment of cgroup_for_each_descendant_pre() which was incorrectly referring to ->css_offline() instead of ->css_online(). 
Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Cc: stable@vger.kernel.org --- kernel/cgroup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38b136553044..31e9ef319070 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } -- cgit v1.2.3 From 387b8b3e37cb1c257fb607787f73815c30d22859 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 May 2013 15:55:25 -0700 Subject: auditfilter.c: fix kernel-doc warnings Fix kernel-doc warnings in kernel/auditfilter.c: Warning(kernel/auditfilter.c:1029): Excess function parameter 'loginuid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sessionid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sid' description in 'audit_receive_filter' Signed-off-by: Randy Dunlap Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 83a2970295d1..6bd4a90d1991 100644 --- a/kernel/auditfilter.c +++ 
b/kernel/auditfilter.c @@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data - * @loginuid: loginuid of sender - * @sessionid: sessionid for netlink audit message - * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { -- cgit v1.2.3 From 2938d2757fc99c26aa678ce4eba910c4a77c3a55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:33:01 +0200 Subject: tick: Cure broadcast false positive pending bit warning commit 26517f3e (tick: Avoid programming the local cpu timer if broadcast pending) added a warning if the cpu enters broadcast mode again while the pending bit is still set. Meelis reported that the warning triggers. There are two corner cases which have not been considered: 1) cpuidle calls clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) twice. That can result in the following scenario CPU0 CPU1 cpuidle_idle_call() clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) set cpu in tick_broadcast_oneshot_mask broadcast interrupt event expired for cpu1 set pending bit acpi_idle_enter_simple() clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) WARN_ON(pending bit) Move the WARN_ON into the section where we enter broadcast mode so it won't provide false positives on the second call. 2) safe_halt() enables interrupts, so a broadcast interrupt can be delivered before the broadcast mode is disabled. That sets the pending bit for the CPU which receives the broadcast interrupt. However, the interrupt is delivered right away from the broadcast handler, which leaves the pending bit stale. Clear the pending bit for the current cpu in the broadcast handler. Reported-and-tested-by: Meelis Roos Cc: Len Brown Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Rafael J. 
Wysocki Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305271841130.4220@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..0c739423b0f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -511,6 +511,12 @@ again: } } + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + /* Take care of enforced broadcast requests */ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); @@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); /* * We only reprogram the broadcast timer if we -- cgit v1.2.3 From 6721cb60022629ae76365551f05d9658b8d14c55 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 May 2013 14:21:36 -0400 Subject: ring-buffer: Do not poll non allocated cpu buffers The tracing infrastructure sets up for possible CPUs, but it uses the ring buffer polling, it is possible to call the ring buffer polling code with a CPU that hasn't been allocated. This will cause a kernel oops when it access a ring buffer cpu buffer that is part of the possible cpus but hasn't been allocated yet as the CPU has never been online. 
Reported-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b59aea2c48c2..e444ff88f0a4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } -- cgit v1.2.3 From aa848233f740abbabfa7669daca0ab94aaa37bcd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 3 May 2013 23:27:07 +0200 Subject: ntp: Remove unused variable flags in __hardpps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/ntp.c: In function ‘__hardpps’: kernel/time/ntp.c:877: warning: unused variable ‘flags’ commit a076b2146fabb0894cae5e0189a8ba3f1502d737 ("ntp: Remove ntp_lock, using the timekeeping locks to protect ntp state") removed its users, but not the actual variable. Signed-off-by: Geert Uytterhoeven Signed-off-by: John Stultz --- kernel/time/ntp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 12ff13a838c6..8f5b3b98577b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -874,7 +874,6 @@ static void hardpps_update_phase(long error) void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; - unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); -- cgit v1.2.3 From 0d6bd9953f739dad96d9a0de65383e479ab4e10d Mon Sep 17 00:00:00 2001 From: Zoran Markovic Date: Fri, 17 May 2013 11:24:05 -0700 Subject: timekeeping: Correct run-time detection of persistent_clock. 
Since commit 31ade30692dc9680bfc95700d794818fa3f754ac, timekeeping_init() checks for the presence of a persistent clock by attempting to read a non-zero time value. This is an issue on platforms where persistent_clock is implemented as a free-running counter (instead of an RTC) starting from zero on each boot and running during suspend. Examples are some ARM platforms (e.g. PandaBoard). An attempt to read such a clock during timekeeping_init() may return a zero value and falsely declare the persistent clock as missing. Additionally, in the above case suspend times may be accounted twice (once from timekeeping_resume() and once from rtc_resume()), resulting in a gradual drift of system time. This patch does a run-time correction of the issue by doing the same check during timekeeping_suspend(). A better long-term solution would be to return an error when trying to read a non-existent clock and zero when trying to read an uninitialized clock, but that would require changing all persistent_clock implementations. This patch addresses the immediate breakage, for now. Cc: John Stultz Cc: Thomas Gleixner Cc: Feng Tang Cc: stable@vger.kernel.org Signed-off-by: Zoran Markovic [jstultz: Tweaked commit message and subject] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..baeeb5c87cf1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,6 +975,14 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. 
+ */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exist = true; + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); -- cgit v1.2.3 From 2a0ff3fbe39bc93f719ff857e5a359d9780579ff Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Sun, 26 May 2013 21:33:09 +0800 Subject: cgroup: warn about mismatching options of a new mount of an existing hierarchy Since the new __DEVEL__sane_behavior mount option was introduced, if the root cgroup is alive without xattr support, mounting a new cgroup with xattr is rejected by design, which is just fine. However, if the root cgroup is not mounted with __DEVEL__sane_behavior, creating a new cgroup with the xattr option succeeds, although afterwards the EA function does not work as expected and setting attributes under either cgroup fails with ENOTSUPP, e.g. setfattr: /cgroup2/test: Operation not supported Instead of staying silent in this case, it's better to emit a log entry at warning level. That helps the user understand what is going on behind the scenes, and it is purely an improvement that does not break backward compatibility. With this fix, the above mount attempt keeps working as usual, but the following line can be found in the system log: [ ...] cgroup: new mount options do not match the existing superblock tj: minor formatting / message updates. 
Signed-off-by: Jie Liu Reported-by: Alexey Kodanev Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 31e9ef319070..a7c9e6ddb979 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); - if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && - root->flags != opts.flags) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto drop_new_super; + if (root->flags != opts.flags) { + if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } else { + pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + } } /* no subsys rebinding, so refcounts don't change */ -- cgit v1.2.3 From 1bb539ca36e21c2f4fce0865e11df384bc7b7656 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 14:38:43 -0400 Subject: ftrace: Use the rcu _notrace variants for rcu_dereference_raw() and friends As rcu_dereference_raw() under RCU debug config options can add quite a bit of checks, and that tracing uses rcu_dereference_raw(), these checks happen with the function tracer. The function tracer also happens to trace these debug checks too. This added overhead can livelock the system. Have the function tracer use the new RCU _notrace equivalents that do not do the debug checks for RCU. Link: http://lkml.kernel.org/r/20130528184209.467603904@goodmis.org Acked-by: Paul E. 
McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b549b0f5b977..6c508ff33c62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list + * can use rcu_dereference_raw_notrace() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw(list); \ + op = rcu_dereference_raw_notrace(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). 
*/ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw((op)->next)) && \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ unlikely((op) != &ftrace_list_end)) static inline void ftrace_ops_init(struct ftrace_ops *ops) @@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, hhd, node) { + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, hhd, hlist) { + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) struct ftrace_hash *notrace_hash; int ret; - filter_hash = rcu_dereference_raw(ops->filter_hash); - notrace_hash = rcu_dereference_raw(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, hhd, node) { + hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } -- cgit v1.2.3 From 0184d50f9fd17658c232d6ee6d465a87f989d706 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 29 May 2013 15:56:49 -0400 Subject: tracing: Fix bad parameter passed in branch selftest The branch selftest calls trace_test_buffer(), but with the new code it expects the first parameter to be a pointer to a struct trace_buffer. All self tests were changed but the branch selftest was missed. 
This caused either a crash or failed test when the branch selftest was enabled. Link: http://lkml.kernel.org/r/20130529141333.GA24064@localhost Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 55e2cf66967b..2901e3b88590 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); -- cgit v1.2.3 From f17a5194859a82afe4164e938b92035b86c55794 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 30 May 2013 21:10:37 -0400 Subject: tracing: Use current_uid() for critical time tracing The irqsoff tracer records the max time that interrupts are disabled. There are hooks in the assembly code that calls back into the tracer when interrupts are disabled or enabled. When they are enabled, the tracer checks if the amount of time they were disabled is larger than the previous recorded max interrupts off time. If it is, it creates a snapshot of the currently running trace to store where the last largest interrupts off time was held and how it happened. During testing, this RCU lockdep dump appeared: [ 1257.829021] =============================== [ 1257.829021] [ INFO: suspicious RCU usage. ] [ 1257.829021] 3.10.0-rc1-test+ #171 Tainted: G W [ 1257.829021] ------------------------------- [ 1257.829021] /home/rostedt/work/git/linux-trace.git/include/linux/rcupdate.h:780 rcu_read_lock() used illegally while idle! [ 1257.829021] [ 1257.829021] other info that might help us debug this: [ 1257.829021] [ 1257.829021] [ 1257.829021] RCU used illegally from idle CPU! 
[ 1257.829021] rcu_scheduler_active = 1, debug_locks = 0 [ 1257.829021] RCU used illegally from extended quiescent state! [ 1257.829021] 2 locks held by trace-cmd/4831: [ 1257.829021] #0: (max_trace_lock){......}, at: [] stop_critical_timing+0x1a3/0x209 [ 1257.829021] #1: (rcu_read_lock){.+.+..}, at: [] __update_max_tr+0x88/0x1ee [ 1257.829021] [ 1257.829021] stack backtrace: [ 1257.829021] CPU: 3 PID: 4831 Comm: trace-cmd Tainted: G W 3.10.0-rc1-test+ #171 [ 1257.829021] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 [ 1257.829021] 0000000000000001 ffff880065f49da8 ffffffff8153dd2b ffff880065f49dd8 [ 1257.829021] ffffffff81092a00 ffff88006bd78680 ffff88007add7500 0000000000000003 [ 1257.829021] ffff88006bd78680 ffff880065f49e18 ffffffff810daebf ffffffff810dae5a [ 1257.829021] Call Trace: [ 1257.829021] [] dump_stack+0x19/0x1b [ 1257.829021] [] lockdep_rcu_suspicious+0x109/0x112 [ 1257.829021] [] __update_max_tr+0xed/0x1ee [ 1257.829021] [] ? __update_max_tr+0x88/0x1ee [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] update_max_tr_single+0x11d/0x12d [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] stop_critical_timing+0x141/0x209 [ 1257.829021] [] ? trace_hardirqs_on+0xd/0xf [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] time_hardirqs_on+0x2a/0x2f [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] trace_hardirqs_on_caller+0x16/0x197 [ 1257.829021] [] trace_hardirqs_on+0xd/0xf [ 1257.829021] [] user_enter+0xfd/0x107 [ 1257.829021] [] do_notify_resume+0x92/0x97 [ 1257.829021] [] int_signal+0x12/0x17 What happened was entering into the user code, the interrupts were enabled and a max interrupts off was recorded. The trace buffer was saved along with various information about the task: comm, pid, uid, priority, etc. The uid is recorded with task_uid(tsk). 
But this is a macro that uses rcu_read_lock() to retrieve the data, and this happened to happen where RCU is blind (user_enter). As only the preempt and irqs off tracers can have this happen, and they both only have the tsk == current, if tsk == current, use current_uid() instead of task_uid(), as current_uid() does not use RCU as only current can change its uid. This fixes the RCU suspicious splat. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4d79485b3237..1a41023a1f88 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -843,7 +843,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; - max_data->uid = task_uid(tsk); + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; -- cgit v1.2.3 From 346dbb79ea0118ebb0df372b35cab9d5805216cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Apr 2013 19:28:54 +0200 Subject: irqdomain: export irq_domain_add_simple All other irq_domain_add_* functions are exported already, and apparently this one got left out by mistake, which causes build errors for ARM allmodconfig kernels: ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-rcar.ko] undefined! ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-em.ko] undefined! 
Signed-off-by: Arnd Bergmann Acked-by: Simon Horman Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0c..d1adaedb435f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -191,6 +191,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, /* A linear domain is the default */ return irq_domain_add_linear(of_node, size, ops, host_data); } +EXPORT_SYMBOL_GPL(irq_domain_add_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. -- cgit v1.2.3 From 275e31b10ce20613aedceaa5160129c64b260a98 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 14 May 2013 19:02:45 +0800 Subject: kernel/irq/irqdomain.c: before use 'irq_data', need check it whether valid. Since irq_data may be NULL, if so, we WARN_ON(), and continue, 'hwirq' which related with 'irq_data' has to initialize later, or it will cause issue. 
Signed-off-by: Chen Gang Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d1adaedb435f..8c4c8ea6a205 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -398,11 +398,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, while (count--) { int irq = irq_base + count; struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq = irq_data->hwirq; + irq_hw_number_t hwirq; if (WARN_ON(!irq_data || irq_data->domain != domain)) continue; + hwirq = irq_data->hwirq; irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ -- cgit v1.2.3 From 94a63da0ac1a67bfb8b30aec1086523c5031ea5a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 6 Jun 2013 12:10:23 +0100 Subject: irqdomain: document the simple domain first_irq The first_irq needs to be zero to get a linear domain and that comes with special semantics. We want to simplify this going forward but some documentation never hurts. Signed-off-by: Linus Walleij Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c4c8ea6a205..54a4d5223238 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping - * @first_irq: first number of irq block assigned to the domain + * @first_irq: first number of irq block assigned to the domain, + * pass zero to assign irqs on-the-fly. 
This will result in a + * linear IRQ domain so it is important to use irq_create_mapping() + * for each used IRQ, especially when SPARSE_IRQ is enabled. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * -- cgit v1.2.3 From 34376a50fb1fa095b9d0636fa41ed2e73125f214 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 6 Jun 2013 14:29:49 -0700 Subject: Fix lockup related to stop_machine being stuck in __do_softirq. The stop machine logic can lock up if all but one of the migration threads make it through the disable-irq step and the one remaining thread gets stuck in __do_softirq. The reason __do_softirq can hang is that it has a bail-out based on jiffies timeout, but in the lockup case, jiffies itself is not incremented. To work around this, re-add the max_restart counter in __do_irq and stop processing irqs after 10 restarts. Thanks to Tejun Heo and Rusty Russell and others for helping me track this down. This was introduced in 3.9 by commit c10d73671ad3 ("softirq: reduce latencies"). It may be worth looking into ath9k to see if it has issues with its irq handler at a later date. 
The hang stack traces look something like this: ------------[ cut here ]------------ WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7() Watchdog detected hard LOCKUP on cpu 2 Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] Pid: 23, comm: migration/2 Tainted: G C 3.9.4+ #11 Call Trace: warn_slowpath_common+0x85/0x9f warn_slowpath_fmt+0x46/0x48 watchdog_overflow_callback+0x9c/0xa7 __perf_event_overflow+0x137/0x1cb perf_event_overflow+0x14/0x16 intel_pmu_handle_irq+0x2dc/0x359 perf_event_nmi_handler+0x19/0x1b nmi_handle+0x7f/0xc2 do_nmi+0xbc/0x304 end_repeat_nmi+0x1e/0x2e <> cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 ---[ end trace 4947dfa9b0a4cec3 ]--- BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17] Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] irq event stamp: 835637905 hardirqs last enabled at (835637904): __do_softirq+0x9f/0x257 hardirqs last disabled at (835637905): apic_timer_interrupt+0x6d/0x80 softirqs last enabled at (5654720): __do_softirq+0x1ff/0x257 softirqs last disabled at (5654725): irq_exit+0x5f/0xbb CPU 1 Pid: 17, comm: migration/1 Tainted: G WC 3.9.4+ #11 To be filled by O.E.M. To be filled by O.E.M./To be filled by O.E.M. 
RIP: tasklet_hi_action+0xf0/0xf0 Process migration/1 Call Trace: __do_softirq+0x117/0x257 irq_exit+0x5f/0xbb smp_apic_timer_interrupt+0x8a/0x98 apic_timer_interrupt+0x72/0x80 printk+0x4d/0x4f stop_machine_cpu_stop+0x22c/0x274 cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 Signed-off-by: Ben Greear Acked-by: Tejun Heo Acked-by: Pekka Riikonen Cc: Eric Dumazet Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/softirq.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..3d6833f125d3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing for at most 2 ms, - * and if need_resched() is not set. + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. + * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in + * certain cases, such as stop_machine(), jiffies may cease to + * increment and so we need the MAX_SOFTIRQ_RESTART limit as + * well to make sure we eventually return from this method. * * These limits have been established via experimentation. * The two things to balance is latency against fairness - @@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); * should not be able to lock up the box. 
*/ #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) +#define MAX_SOFTIRQ_RESTART 10 asmlinkage void __do_softirq(void) { @@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void) unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; + int max_restart = MAX_SOFTIRQ_RESTART; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -265,7 +271,8 @@ restart: pending = local_softirq_pending(); if (pending) { - if (time_before(jiffies, end) && !need_resched()) + if (time_before(jiffies, end) && !need_resched() && + --max_restart) goto restart; wakeup_softirqd(); -- cgit v1.2.3 From 58e8eedf18577c7eac722d5d1f190507ea263d1b Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 23 Apr 2013 10:32:39 +0900 Subject: tracing: Fix outputting formats of x86-tsc and counter when use trace_clock Outputting formats of x86-tsc and counter should be a raw format, but after applying the patch(2b6080f28c7cc3efc8625ab71495aae89aeb63a0), the format was changed to nanosec. This is because the global variable trace_clock_id was used. When we use multiple buffers, clock_id of each sub-buffer should be used. Then, this patch uses tr->clock_id instead of the global variable trace_clock_id. [ Basically, this fixes a regression where the multibuffer code changed the trace_clock file to update tr->clock_id but the traces still use the old global trace_clock_id variable, negating the file's effect. The global trace_clock_id variable is obsolete and removed. 
- SR ] Link: http://lkml.kernel.org/r/20130423013239.22334.7394.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace.h | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f88..e71a8be4a6ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,8 +652,6 @@ static struct { ARCH_TRACE_CLOCKS }; -int trace_clock_id; - /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -2826,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* stop the trace while dumping if we are not opening "snapshot" */ @@ -3825,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->cpu_file = tc->cpu; @@ -5095,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - if (trace_clocks[trace_clock_id].in_ns) { + if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f1..20572ed88c5c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; -extern int trace_clock_id; - /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Jun 2013 14:04:36 -0700 Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug There are instances in the kernel where we would like to disable CPU hotplug (from sysfs) during some important operation. Today the freezer code depends on this and the code to do it was kinda tailor-made for that. Restructure the code and make it generic enough to be useful for other usecases too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Cc: H. 
Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Shawn Guo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 55 +++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void) } core_initcall(alloc_frozen_cpus); -/* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. 
- */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: -- cgit v1.2.3 From cf7df378aa4ff7da3a44769b7ff6e9eef1a9f3db Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Wed, 12 Jun 2013 14:04:37 -0700 Subject: reboot: migrate shutdown/reboot to boot cpu We recently noticed that reboot of a 1024 cpu machine takes approx 16 minutes of just stopping the cpus. The slowdown was tracked to commit f96972f2dc63 ("kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()"). The current implementation does all the work of hot removing the cpus before halting the system. We are switching to just migrating to the boot cpu and then continuing with shutdown/reboot. This also has the effect of not breaking x86's command line parameter for specifying the reboot cpu. Note, this code was shamelessly copied from arch/x86/kernel/reboot.c with bits removed pertaining to the reboot_cpu command line parameter. Signed-off-by: Robin Holt Tested-by: Shawn Guo Cc: "Srivatsa S. Bhat" Cc: H.
Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); 
syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.2.3 From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jun 2013 14:04:39 -0700 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg The dmesg_restrict sysctl currently covers the syslog method for accessing dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. Newer versions of util-linux dmesg(1) default to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows: - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ). - everything, after an open. - syslog syscall allows: - anything, if CAP_SYSLOG. - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0. - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (e.g. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check.
- /dev/kmsg allows: - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 [akpm@linux-foundation.org: use pr_warn_once()] Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 91 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index fa36e1494420..8212c1aef125 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. 
+ */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. 
- */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* -- cgit v1.2.3 From f000cfdde5de4fc15dead5ccf524359c07eadf2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 12 Jun 2013 14:04:46 -0700 Subject: audit: wait_for_auditd() should use TASK_UNINTERRUPTIBLE audit_log_start() does wait_for_auditd() in a loop until audit_backlog_wait_time passes or audit_skb_queue has a room. If signal_pending() is true this becomes a busy-wait loop, schedule() in TASK_INTERRUPTIBLE won't block. Thanks to Guy for fully investigating and explaining the problem. 
(akpm: that'll cause the system to lock up on a non-preemptible uniprocessor kernel) (Guy: "Our customer was in fact running a uniprocessor machine, and they reported a system hang.") Signed-off-by: Oleg Nesterov Reported-by: Guy Streeter Cc: Eric Paris Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd3..91e53d04b6a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && -- cgit v1.2.3 From 736f3203a06eafd0944103775a98584082744c6b Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 12 Jun 2013 14:05:07 -0700 Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from kill_rules() audit_add_tree_rule() must set 'rule->tree = NULL;' first, to protect the rule itself from being freed in kill_rules(). The reason is that when it is killed, the 'rule' itself may have already been released, so we should not access it. One example: we add a rule to an inode just as another task is deleting this inode. The work flow for adding a rule: audit_receive() -> (need audit_cmd_mutex lock) audit_receive_skb() -> audit_receive_msg() -> audit_receive_filter() -> audit_add_rule() -> audit_add_tree_rule() -> (need audit_filter_mutex lock) ... unlock audit_filter_mutex get_tree() ... iterate_mounts() -> (iterate all related inodes) tag_mount() -> tag_trunk() -> create_trunk() -> (assume it is 1st rule) fsnotify_add_mark() -> fsnotify_add_inode_mark() -> (add mark to inode->i_fsnotify_marks) ... get_tree(); (each inode will get one) ...
lock audit_filter_mutex The work flow for deleting an inode: __destroy_inode() -> fsnotify_inode_delete() -> __fsnotify_inode_delete() -> fsnotify_clear_marks_by_inode() -> (get mark from inode->i_fsnotify_marks) fsnotify_destroy_mark() -> fsnotify_destroy_mark_locked() -> audit_tree_freeing_mark() -> evict_chunk() -> ... tree->goner = 1 ... kill_rules() -> (assume current->audit_context == NULL) call_rcu() -> (rule->tree != NULL) audit_free_rule_rcu() -> audit_free_rule() ... audit_schedule_prune() -> (assume current->audit_context == NULL) kthread_run() -> (need audit_cmd_mutex and audit_filter_mutex lock) prune_one() -> (delete it from prune_list) put_tree(); (match the original get_tree above) Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); -- cgit v1.2.3