From 6d21af4f7d0ab660b24c8635f4ed577f40cd2978 Mon Sep 17 00:00:00 2001 From: Javi Merino Date: Wed, 26 Oct 2011 10:16:11 +0100 Subject: irq: Fix comment typo ist->is Signed-off-by: Javi Merino Signed-off-by: Jiri Kosina --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b956fa20308..3261c4d478a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1281,7 +1281,7 @@ EXPORT_SYMBOL(free_irq); * and to set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler ist + * then you need to supply @handler and @thread_fn. @handler is * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return -- cgit v1.2.3 From 50fb4f7fc907efff65eadb0b74387a9ffed6e849 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:22 -0800 Subject: freezer: fix current->state restoration race in refrigerator() refrigerator() saves current->state before entering frozen state and restores it before returning using __set_current_state(); however, this is racy, for example, please consider the following sequence. set_current_state(TASK_INTERRUPTIBLE); try_to_freeze(); if (kthread_should_stop()) break; schedule(); If kthread_stop() races with ->state restoration, the restoration can restore ->state to TASK_INTERRUPTIBLE after kthread_stop() sets it to TASK_RUNNING but kthread_should_stop() may still see zero ->should_stop because there's no memory barrier between restoring TASK_INTERRUPTIBLE and kthread_should_stop() test. This isn't restricted to kthread_should_stop(). current->state is often used in memory barrier based synchronization and silently restoring it w/o mb breaks them. Use set_current_state() instead. Signed-off-by: Tejun Heo --- kernel/freezer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 7be56c534397..3f460104a9d6 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -58,7 +58,13 @@ void refrigerator(void) current->flags &= ~PF_FREEZING; pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); + + /* + * Restore saved task state before returning. The mb'd version + * needs to be used; otherwise, it might silently break + * synchronization which depends on ordered task state change. + */ + set_current_state(save); } EXPORT_SYMBOL(refrigerator); -- cgit v1.2.3 From a0acae0e886d44bd5ce6d2f173c1ace0fcf0d9f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:22 -0800 Subject: freezer: unexport refrigerator() and update try_to_freeze() slightly There is no reason to export two functions for entering the refrigerator. Calling refrigerator() instead of try_to_freeze() doesn't save anything noticeable or removes any race condition. * Rename refrigerator() to __refrigerator() and make it return bool indicating whether it scheduled out for freezing. * Update try_to_freeze() to return bool and relay the return value of __refrigerator() if freezing(). * Convert all refrigerator() users to try_to_freeze(). * Update documentation accordingly. * While at it, add might_sleep() to try_to_freeze(). Signed-off-by: Tejun Heo Cc: Samuel Ortiz Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Steven Whitehouse Cc: Andrew Morton Cc: Jan Kara Cc: KONISHI Ryusuke Cc: Christoph Hellwig --- kernel/freezer.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 3f460104a9d6..732f14f5944f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -23,10 +23,11 @@ static inline void frozen_process(void) } /* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) +bool __refrigerator(void) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ + bool was_frozen = false; long save; task_lock(current); @@ -35,7 +36,7 @@ void refrigerator(void) task_unlock(current); } else { task_unlock(current); - return; + return was_frozen; } save = current->state; pr_debug("%s entered refrigerator\n", current->comm); @@ -51,6 +52,7 @@ void refrigerator(void) set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; + was_frozen = true; schedule(); } @@ -65,8 +67,10 @@ void refrigerator(void) * synchronization which depends on ordered task state change. */ set_current_state(save); + + return was_frozen; } -EXPORT_SYMBOL(refrigerator); +EXPORT_SYMBOL(__refrigerator); static void fake_signal_wake_up(struct task_struct *p) { -- cgit v1.2.3 From 8a32c441c1609f80e55df75422324a1151208f40 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: implement and use kthread_freezable_should_stop() Writeback and thinkpad_acpi have been using thaw_process() to prevent deadlock between the freezer and kthread_stop(); unfortunately, this is inherently racy - nothing prevents freezing from happening between thaw_process() and kthread_stop(). This patch implements kthread_freezable_should_stop() which enters refrigerator if necessary but is guaranteed to return if kthread_stop() is invoked. Both thaw_process() users are converted to use the new function. Note that this deadlock condition exists for many of freezable kthreads. They need to be converted to use the new should_stop or freezable workqueue. Tested with synthetic test case. Signed-off-by: Tejun Heo Acked-by: Henrique de Moraes Holschuh Cc: Jens Axboe Cc: Oleg Nesterov --- kernel/freezer.c | 6 ++++-- kernel/kthread.c | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 732f14f5944f..b83c30e9483a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * freezing is complete, mark current process as frozen @@ -23,7 +24,7 @@ static inline void frozen_process(void) } /* Refrigerator is place where frozen processes are stored :-). */ -bool __refrigerator(void) +bool __refrigerator(bool check_kthr_stop) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ @@ -50,7 +51,8 @@ bool __refrigerator(void) for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) + if (!frozen(current) || + (check_kthr_stop && kthread_should_stop())) break; was_frozen = true; schedule(); diff --git a/kernel/kthread.c b/kernel/kthread.c index b6d216a92639..1c36deaae2f1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -58,6 +58,31 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_freezable_should_stop - should this freezable kthread return now? + * @was_frozen: optional out parameter, indicates whether %current was frozen + * + * kthread_should_stop() for freezable kthreads, which will enter + * refrigerator if necessary. This function is safe from kthread_stop() / + * freezer deadlock and freezable kthreads should use this function instead + * of calling try_to_freeze() directly. + */ +bool kthread_freezable_should_stop(bool *was_frozen) +{ + bool frozen = false; + + might_sleep(); + + if (unlikely(freezing(current))) + frozen = __refrigerator(true); + + if (was_frozen) + *was_frozen = frozen; + + return kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); + /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question -- cgit v1.2.3 From a5be2d0d1a8746e7be5210e3d6b904455000443c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: rename thaw_process() to __thaw_task() and simplify the implementation thaw_process() now has only internal users - system and cgroup freezers. Remove the unnecessary return value, rename, unexport and collapse __thaw_process() into it. This will help further updates to the freezer code. -v3: oom_kill grew a use of thaw_process() while this patch was pending. Convert it to use __thaw_task() for now. In the longer term, this should be handled by allowing tasks to die if killed even if it's frozen. -v2: minor style update as suggested by Matt. Signed-off-by: Tejun Heo Cc: Paul Menage Cc: Matt Helsley --- kernel/cgroup_freezer.c | 7 +++---- kernel/freezer.c | 31 ++++++++++++------------------- kernel/power/process.c | 2 +- 3 files changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e828a2ca8e6..a6d405a86ee0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -130,7 +130,7 @@ struct cgroup_subsys freezer_subsys; * write_lock css_set_lock (cgroup iterator start) * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) + * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -300,9 +300,8 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - thaw_process(task); - } + while ((task = cgroup_iter_next(cgroup, &it))) + __thaw_task(task); cgroup_iter_end(cgroup, &it); freezer->state = CGROUP_THAWED; diff --git a/kernel/freezer.c b/kernel/freezer.c index b83c30e9483a..c851d588e29f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -145,18 +145,8 @@ void cancel_freezing(struct task_struct *p) } } -static int __thaw_process(struct task_struct *p) -{ - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - return 1; - } - clear_freeze_flag(p); - return 0; -} - /* - * Wake up a frozen process + * Wake up a frozen task * * task_lock() is needed to prevent the race with refrigerator() which may * occur if the freezing of tasks fails. Namely, without the lock, if the @@ -164,15 +154,18 @@ static int __thaw_process(struct task_struct *p) * refrigerator() could call frozen_process(), in which case the task would be * frozen and no one would thaw it. */ -int thaw_process(struct task_struct *p) +void __thaw_task(struct task_struct *p) { + bool was_frozen; + task_lock(p); - if (__thaw_process(p) == 1) { - task_unlock(p); - wake_up_process(p); - return 1; - } + was_frozen = frozen(p); + if (was_frozen) + p->flags &= ~PF_FROZEN; + else + clear_freeze_flag(p); task_unlock(p); - return 0; + + if (was_frozen) + wake_up_process(p); } -EXPORT_SYMBOL(thaw_process); diff --git a/kernel/power/process.c b/kernel/power/process.c index addbbe5531bc..fe2787207f00 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -186,7 +186,7 @@ static void thaw_tasks(bool nosig_only) if (cgroup_freezing_or_frozen(p)) continue; - thaw_process(p); + __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } -- cgit v1.2.3 From a585042f7b933539a0b6bc63650c2d49ffb2e55d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: remove racy clear_freeze_flag() and set PF_NOFREEZE on dead tasks clear_freeze_flag() in exit_mm() is racy. Freezing can start afterwards. Remove it. Skipping freezer for exiting task will be properly implemented later. Also, freezable() was testing exit_state directly to make system freezer ignore dead tasks. Let the exiting task set PF_NOFREEZE after entering TASK_DEAD instead. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/exit.c | 3 +-- kernel/power/process.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..95a4141d07e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -679,8 +679,6 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -1040,6 +1038,7 @@ NORET_TYPE void do_exit(long code) exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; + tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/power/process.c b/kernel/power/process.c index fe2787207f00..23822dc14b6c 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -25,8 +25,7 @@ static inline int freezable(struct task_struct * p) { if ((p == current) || - (p->flags & PF_NOFREEZE) || - (p->exit_state != 0)) + (p->flags & PF_NOFREEZE)) return 0; return 1; } -- cgit v1.2.3 From 6cd8dedcdd8e8de01391a7cf25f0b2afeb24f8f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: don't distinguish nosig tasks on thaw There's no point in thawing nosig tasks before others. There's no ordering requirement between the two groups on thaw, which the staged thawing can't guarantee anyway. Simplify thaw_processes() by removing the distinction and collapsing thaw_tasks() into thaw_processes(). This will help further updates to freezer. Signed-off-by: Tejun Heo --- kernel/power/process.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 23822dc14b6c..9db048fb0d70 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -170,34 +170,28 @@ int freeze_kernel_threads(void) return error; } -static void thaw_tasks(bool nosig_only) +void thaw_processes(void) { struct task_struct *g, *p; + oom_killer_enable(); + + printk("Restarting tasks ... "); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezable(p)) continue; - if (nosig_only && should_send_signal(p)) - continue; - if (cgroup_freezing_or_frozen(p)) continue; __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -} - -void thaw_processes(void) -{ - oom_killer_enable(); - printk("Restarting tasks ... "); - thaw_workqueues(); - thaw_tasks(true); - thaw_tasks(false); schedule(); printk("done.\n"); } -- cgit v1.2.3 From 0c9af09262864a2744091ee94c98c4a8fd60c98b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: use dedicated lock instead of task_lock() + memory barrier Freezer synchronization is needlessly complicated - it's by no means a hot path and the priority is staying unintrusive and safe. This patch makes it simply use a dedicated lock instead of piggy-backing on task_lock() and playing with memory barriers. On the failure path of try_to_freeze_tasks(), locking is moved from it to cancel_freezing(). This makes the frozen() test racy but the race here is a non-issue as the warning is printed for tasks which failed to enter frozen for 20 seconds and race on PF_FROZEN at the last moment doesn't change anything. This simplifies freezer implementation and eases further changes including some race fixes. Signed-off-by: Tejun Heo --- kernel/freezer.c | 84 ++++++++++++++++++++++---------------------------- kernel/power/process.c | 2 -- 2 files changed, 37 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c851d588e29f..4130e48649bb 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -11,17 +11,8 @@ #include #include -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - smp_wmb(); - } - clear_freeze_flag(current); -} +/* protects freezing and frozen transitions */ +static DEFINE_SPINLOCK(freezer_lock); /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) @@ -31,14 +22,16 @@ bool __refrigerator(bool check_kthr_stop) bool was_frozen = false; long save; - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); + spin_lock_irq(&freezer_lock); + if (!freezing(current)) { + spin_unlock_irq(&freezer_lock); return was_frozen; } + if (!(current->flags & PF_NOFREEZE)) + current->flags |= PF_FROZEN; + clear_freeze_flag(current); + spin_unlock_irq(&freezer_lock); + save = current->state; pr_debug("%s entered refrigerator\n", current->comm); @@ -99,21 +92,18 @@ static void fake_signal_wake_up(struct task_struct *p) */ bool freeze_task(struct task_struct *p, bool sig_only) { - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - smp_rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&freezer_lock, flags); + + if (sig_only && !should_send_signal(p)) + goto out_unlock; + + if (frozen(p)) + goto out_unlock; + + set_freeze_flag(p); if (should_send_signal(p)) { fake_signal_wake_up(p); @@ -123,26 +113,28 @@ bool freeze_task(struct task_struct *p, bool sig_only) * TASK_RUNNING transition can't race with task state * testing in try_to_freeze_tasks(). */ - } else if (sig_only) { - return false; } else { wake_up_state(p, TASK_INTERRUPTIBLE); } - - return true; + ret = true; +out_unlock: + spin_unlock_irqrestore(&freezer_lock, flags); + return ret; } void cancel_freezing(struct task_struct *p) { unsigned long flags; + spin_lock_irqsave(&freezer_lock, flags); if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); + spin_lock(&p->sighand->siglock); recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock(&p->sighand->siglock); } + spin_unlock_irqrestore(&freezer_lock, flags); } /* @@ -156,16 +148,14 @@ void cancel_freezing(struct task_struct *p) */ void __thaw_task(struct task_struct *p) { - bool was_frozen; + unsigned long flags; - task_lock(p); - was_frozen = frozen(p); - if (was_frozen) + spin_lock_irqsave(&freezer_lock, flags); + if (frozen(p)) { p->flags &= ~PF_FROZEN; - else - clear_freeze_flag(p); - task_unlock(p); - - if (was_frozen) wake_up_process(p); + } else { + clear_freeze_flag(p); + } + spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/power/process.c b/kernel/power/process.c index 9db048fb0d70..bd420ca48261 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -118,11 +118,9 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); if (!wakeup && freezing(p) && !freezer_should_skip(p)) sched_show_task(p); cancel_freezing(p); - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } else { -- cgit v1.2.3 From 6907483b4e803a20f0b48cc9afa3817420ce61c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: make freezing indicate freeze condition in effect Currently freezing (TIF_FREEZE) and frozen (PF_FROZEN) states are interlocked - freezing is set to request freeze and when the task actually freezes, it clears freezing and sets frozen. This interlocking makes things more complex than necessary - freezing doesn't mean there's freezing condition in effect and frozen doesn't match the task actually entering and leaving frozen state (it's cleared by the thawing task). This patch makes freezing indicate that freeze condition is in effect. A task enters and stays frozen if freezing. This makes PF_FROZEN manipulation done only by the task itself and prevents wakeup from __thaw_task() leaking outside of refrigerator. The only place which needs to tell freezing && !frozen is try_to_freeze_task() to whine about tasks which don't enter frozen. It's updated to test the condition explicitly. With the change, frozen() state my linger after __thaw_task() until the task wakes up and exits fridge. This can trigger BUG_ON() in update_if_frozen(). Work it around by testing freezing() && frozen() instead of frozen(). -v2: Oleg pointed out missing re-check of freezing() when trying to clear FROZEN and possible spurious BUG_ON() trigger in update_if_frozen(). Both fixed. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Paul Menage --- kernel/cgroup_freezer.c | 2 +- kernel/freezer.c | 42 ++++++++++++++++++++++++------------------ kernel/power/process.c | 3 ++- 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a6d405a86ee0..cd27b0825560 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -231,7 +231,7 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (frozen(task)) + if (freezing(task) && frozen(task)) nfrozen++; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 4130e48649bb..a8822be43da0 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -22,14 +22,19 @@ bool __refrigerator(bool check_kthr_stop) bool was_frozen = false; long save; + /* + * Enter FROZEN. If NOFREEZE, schedule immediate thawing by + * clearing freezing. + */ spin_lock_irq(&freezer_lock); +repeat: if (!freezing(current)) { spin_unlock_irq(&freezer_lock); return was_frozen; } - if (!(current->flags & PF_NOFREEZE)) - current->flags |= PF_FROZEN; - clear_freeze_flag(current); + if (current->flags & PF_NOFREEZE) + clear_freeze_flag(current); + current->flags |= PF_FROZEN; spin_unlock_irq(&freezer_lock); save = current->state; @@ -44,7 +49,7 @@ bool __refrigerator(bool check_kthr_stop) for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current) || + if (!freezing(current) || (check_kthr_stop && kthread_should_stop())) break; was_frozen = true; @@ -54,6 +59,13 @@ bool __refrigerator(bool check_kthr_stop) /* Remove the accounting blocker */ current->flags &= ~PF_FREEZING; + /* leave FROZEN */ + spin_lock_irq(&freezer_lock); + if (freezing(current)) + goto repeat; + current->flags &= ~PF_FROZEN; + spin_unlock_irq(&freezer_lock); + pr_debug("%s left refrigerator\n", current->comm); /* @@ -137,25 +149,19 @@ void cancel_freezing(struct task_struct *p) spin_unlock_irqrestore(&freezer_lock, flags); } -/* - * Wake up a frozen task - * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. - */ void __thaw_task(struct task_struct *p) { unsigned long flags; + /* + * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to + * be visible to @p as waking up implies wmb. Waking up inside + * freezer_lock also prevents wakeups from leaking outside + * refrigerator. + */ spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) { - p->flags &= ~PF_FROZEN; + clear_freeze_flag(p); + if (frozen(p)) wake_up_process(p); - } else { - clear_freeze_flag(p); - } spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/power/process.c b/kernel/power/process.c index bd420ca48261..e6e2739190b5 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -118,7 +118,8 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!wakeup && freezing(p) && !freezer_should_skip(p)) + if (!wakeup && !freezer_should_skip(p) && + freezing(p) && !frozen(p)) sched_show_task(p); cancel_freezing(p); } while_each_thread(g, p); -- cgit v1.2.3 From 85f1d476653f52c97ca75466b2494e67c1cbd25d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: test freezable conditions while holding freezer_lock try_to_freeze_tasks() and thaw_processes() use freezable() and frozen() as preliminary tests before initiating operations on a task. These are done without any synchronization and hinder with synchronization cleanup without any real performance benefits. In try_to_freeze_tasks(), open code self test and move PF_NOFREEZE and frozen() tests inside freezer_lock in freeze_task(). thaw_processes() can simply drop freezable() test as frozen() test in __thaw_task() is enough. Note: This used to be a part of larger patch to fix set_freezable() race. Separated out to satisfy ordering among dependent fixes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/freezer.c | 3 ++- kernel/power/process.c | 16 +--------------- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index a8822be43da0..a257ecd37c48 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -109,7 +109,8 @@ bool freeze_task(struct task_struct *p, bool sig_only) spin_lock_irqsave(&freezer_lock, flags); - if (sig_only && !should_send_signal(p)) + if ((p->flags & PF_NOFREEZE) || + (sig_only && !should_send_signal(p))) goto out_unlock; if (frozen(p)) diff --git a/kernel/power/process.c b/kernel/power/process.c index e6e2739190b5..e59676f5811d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,14 +22,6 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezable(struct task_struct * p) -{ - if ((p == current) || - (p->flags & PF_NOFREEZE)) - return 0; - return 1; -} - static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; @@ -52,10 +44,7 @@ static int try_to_freeze_tasks(bool sig_only) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezable(p)) - continue; - - if (!freeze_task(p, sig_only)) + if (p == current || !freeze_task(p, sig_only)) continue; /* @@ -181,9 +170,6 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezable(p)) - continue; - if (cgroup_freezing_or_frozen(p)) continue; -- cgit v1.2.3 From 376fede80e74d98b49d1ba9ac18f23c9fd026ddd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: kill PF_FREEZING With the previous changes, there's no meaningful difference between PF_FREEZING and PF_FROZEN. Remove PF_FREEZING and use PF_FROZEN instead in task_contributes_to_load(). Signed-off-by: Tejun Heo --- kernel/freezer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index a257ecd37c48..b8b562124ba9 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,9 +44,6 @@ repeat: recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - /* prevent accounting of that task to load */ - current->flags |= PF_FREEZING; - for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!freezing(current) || @@ -56,9 +53,6 @@ repeat: schedule(); } - /* Remove the accounting blocker */ - current->flags &= ~PF_FREEZING; - /* leave FROZEN */ spin_lock_irq(&freezer_lock); if (freezing(current)) -- cgit v1.2.3 From 03afed8bc296fa70186ba832c1126228bb992465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:24 -0800 Subject: freezer: clean up freeze_processes() failure path freeze_processes() failure path is rather messy. Freezing is canceled for workqueues and tasks which aren't frozen yet but frozen tasks are left alone and should be thawed by the caller and of course some callers (xen and kexec) didn't do it. This patch updates __thaw_task() to handle cancelation correctly and makes freeze_processes() and freeze_kernel_threads() call thaw_processes() on failure instead so that the system is fully thawed on failure. Unnecessary [suspend_]thaw_processes() calls are removed from kernel/power/hibernate.c, suspend.c and user.c. While at it, restructure error checking if clause in suspend_prepare() to be less weird. -v2: Srivatsa spotted missing removal of suspend_thaw_processes() in suspend_prepare() and error in commit message. Updated. Signed-off-by: Tejun Heo Acked-by: Srivatsa S. Bhat --- kernel/freezer.c | 25 +++++++++---------------- kernel/power/hibernate.c | 15 ++------------- kernel/power/process.c | 16 ++++++++-------- kernel/power/suspend.c | 8 +++----- kernel/power/user.c | 4 +--- 5 files changed, 23 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index b8b562124ba9..11e32d419dec 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -129,21 +129,6 @@ out_unlock: return ret; } -void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock(&p->sighand->siglock); - recalc_sigpending_and_wake(p); - spin_unlock(&p->sighand->siglock); - } - spin_unlock_irqrestore(&freezer_lock, flags); -} - void __thaw_task(struct task_struct *p) { unsigned long flags; @@ -153,10 +138,18 @@ void __thaw_task(struct task_struct *p) * be visible to @p as waking up implies wmb. Waking up inside * freezer_lock also prevents wakeups from leaking outside * refrigerator. + * + * If !FROZEN, @p hasn't reached refrigerator, recalc sigpending to + * avoid leaving dangling TIF_SIGPENDING behind. */ spin_lock_irqsave(&freezer_lock, flags); clear_freeze_flag(p); - if (frozen(p)) + if (frozen(p)) { wake_up_process(p); + } else { + spin_lock(&p->sighand->siglock); + recalc_sigpending_and_wake(p); + spin_unlock(&p->sighand->siglock); + } spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 196c01268ebd..ba2319ffc860 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -607,17 +607,6 @@ static void power_down(void) while(1); } -static int prepare_processes(void) -{ - int error = 0; - - if (freeze_processes()) { - error = -EBUSY; - thaw_processes(); - } - return error; -} - /** * hibernate - Carry out system hibernation, including saving the image. */ @@ -650,7 +639,7 @@ int hibernate(void) sys_sync(); printk("done.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) goto Finish; @@ -811,7 +800,7 @@ static int software_resume(void) goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); - error = prepare_processes(); + error = freeze_processes(); if (error) { swsusp_close(FMODE_READ); goto Done; diff --git a/kernel/power/process.c b/kernel/power/process.c index e59676f5811d..ce643838a00c 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -91,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs = elapsed_csecs64; if (todo) { - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ printk("\n"); printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", @@ -103,14 +98,11 @@ static int try_to_freeze_tasks(bool sig_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - thaw_workqueues(); - read_lock(&tasklist_lock); do_each_thread(g, p) { if (!wakeup && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } else { @@ -123,6 +115,8 @@ static int try_to_freeze_tasks(bool sig_only) /** * freeze_processes - Signal user space processes to enter the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { @@ -137,11 +131,15 @@ int freeze_processes(void) printk("\n"); BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_kernel_threads(void) { @@ -155,6 +153,8 @@ int freeze_kernel_threads(void) printk("\n"); BUG_ON(in_atomic()); + if (error) + thaw_processes(); return error; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4953dc054c53..d336b27d1104 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,13 +106,11 @@ static int suspend_prepare(void) goto Finish; error = suspend_freeze_processes(); - if (error) { - suspend_stats.failed_freeze++; - dpm_save_failed_step(SUSPEND_FREEZE); - } else + if (!error) return 0; - suspend_thaw_processes(); + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535c2b88..7cc3f5bc5c24 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -257,10 +257,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; error = freeze_processes(); - if (error) { - thaw_processes(); + if (error) usermodehelper_enable(); - } if (!error) data->frozen = 1; break; -- cgit v1.2.3 From 22b4e111fa01a1147aa562ceaf18a752a928ef4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: cgroup_freezer: prepare for removal of TIF_FREEZE TIF_FREEZE will be removed soon and freezing() will directly test whether any freezing condition is in effect. Make the following changes in preparation. * Rename cgroup_freezing_or_frozen() to cgroup_freezing() and make it return bool. * Make cgroup_freezing() access task_freezer() under rcu read lock instead of task_lock(). This makes the state dereferencing racy against task moving to another cgroup; however, it was already racy without this change as ->state dereference wasn't synchronized. This will be later dealt with using attach hooks. * freezer->state is now set before trying to push tasks into the target state. -v2: Oleg pointed out that freeze_change_state() was setting freeze->state incorrectly to CGROUP_FROZEN instead of CGROUP_FREEZING. Fixed. -v3: Matt pointed out that setting CGROUP_FROZEN used to always invoke try_to_freeze_cgroup() regardless of the current state. Patch updated such that the actual freeze/thaw operations are always performed on invocation. This shouldn't make any difference unless something is broken. Signed-off-by: Tejun Heo Acked-by: Paul Menage Cc: Li Zefan Cc: Oleg Nesterov --- kernel/cgroup_freezer.c | 40 +++++++++++++--------------------------- kernel/power/process.c | 2 +- 2 files changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index cd27b0825560..e6a1b8d1b8bc 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -static inline int __cgroup_freezing_or_frozen(struct task_struct *task) +bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state = task_freezer(task)->state; - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); -} + enum freezer_state state; + bool ret; -int cgroup_freezing_or_frozen(struct task_struct *task) -{ - int result; - task_lock(task); - result = __cgroup_freezing_or_frozen(task); - task_unlock(task); - return result; + rcu_read_lock(); + state = task_freezer(task)->state; + ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + rcu_read_unlock(); + + return ret; } /* @@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys; * freezer_can_attach(): * cgroup_mutex (held by caller of can_attach) * - * cgroup_freezing_or_frozen(): - * task->alloc_lock (to get task's cgroup) - * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): * freezer->lock * sighand->siglock (if the cgroup is freezing) @@ -177,13 +172,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - rcu_read_lock(); - if (__cgroup_freezing_or_frozen(tsk)) { - rcu_read_unlock(); - return -EBUSY; - } - rcu_read_unlock(); - return 0; + return cgroup_freezing(tsk) ? -EBUSY : 0; } static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) @@ -279,7 +268,6 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) struct task_struct *task; unsigned int num_cant_freeze_now = 0; - freezer->state = CGROUP_FREEZING; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) @@ -303,8 +291,6 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) __thaw_task(task); cgroup_iter_end(cgroup, &it); - - freezer->state = CGROUP_THAWED; } static int freezer_change_state(struct cgroup *cgroup, @@ -318,20 +304,20 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); update_if_frozen(cgroup, freezer); - if (goal_state == freezer->state) - goto out; switch (goal_state) { case CGROUP_THAWED: + freezer->state = CGROUP_THAWED; unfreeze_cgroup(cgroup, freezer); break; case CGROUP_FROZEN: + freezer->state = CGROUP_FREEZING; retval = try_to_freeze_cgroup(cgroup, freezer); break; default: BUG(); } -out: + spin_unlock_irq(&freezer->lock); return retval; diff --git a/kernel/power/process.c b/kernel/power/process.c index ce643838a00c..9f6f5c755cfa 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -170,7 +170,7 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (cgroup_freezing_or_frozen(p)) + if (cgroup_freezing(p)) continue; __thaw_task(p); -- cgit v1.2.3 From a3201227f803ad7fd43180c5195dbe5a2bf998aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE Using TIF_FREEZE for freezing worked when there was only single freezing condition (the PM one); however, now there is also the cgroup_freezer and single bit flag is getting clumsy. thaw_processes() is already testing whether cgroup freezing in in effect to avoid thawing tasks which were frozen by both PM and cgroup freezers. This is racy (nothing prevents race against cgroup freezing) and fragile. A much simpler way is to test actual freeze conditions from freezing() - ie. directly test whether PM or cgroup freezing is in effect. This patch adds variables to indicate whether and what type of freezing conditions are in effect and reimplements freezing() such that it directly tests whether any of the two freezing conditions is active and the task should freeze. On fast path, freezing() is still very cheap - it only tests system_freezing_cnt. This makes the clumsy dancing aroung TIF_FREEZE unnecessary and freeze/thaw operations more usual - updating state variables for the new state and nudging target tasks so that they notice the new state and comply. As long as the nudging happens after state update, it's race-free. * This allows use of freezing() in freeze_task(). Replace the open coded tests with freezing(). * p != current test is added to warning printing conditions in try_to_freeze_tasks() failure path. This is necessary as freezing() is now true for the task which initiated freezing too. -v2: Oleg pointed out that re-freezing FROZEN cgroup could increment system_freezing_cnt. Fixed. Signed-off-by: Tejun Heo Acked-by: Paul Menage (for the cgroup portions) --- kernel/cgroup_freezer.c | 10 +++++++- kernel/fork.c | 1 - kernel/freezer.c | 62 +++++++++++++++++++++++++++++++------------------ kernel/power/process.c | 15 ++++++++---- 4 files changed, 60 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e6a1b8d1b8bc..2327ad11725f 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -145,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, static void freezer_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup) { - kfree(cgroup_freezer(cgroup)); + struct freezer *freezer = cgroup_freezer(cgroup); + + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + kfree(freezer); } /* @@ -307,10 +311,14 @@ static int freezer_change_state(struct cgroup *cgroup, switch (goal_state) { case CGROUP_THAWED: + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); freezer->state = CGROUP_THAWED; unfreeze_cgroup(cgroup, freezer); break; case CGROUP_FROZEN: + if (freezer->state == CGROUP_THAWED) + atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; retval = try_to_freeze_cgroup(cgroup, freezer); break; diff --git a/kernel/fork.c b/kernel/fork.c index ba0d17261329..d53316e88d9d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -997,7 +997,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; - clear_freeze_flag(p); } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) diff --git a/kernel/freezer.c b/kernel/freezer.c index 11e32d419dec..f53cd5aa5b2e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -11,9 +11,41 @@ #include #include +/* total number of freezing conditions in effect */ +atomic_t system_freezing_cnt = ATOMIC_INIT(0); +EXPORT_SYMBOL(system_freezing_cnt); + +/* indicate whether PM freezing is in effect, protected by pm_mutex */ +bool pm_freezing; +bool pm_nosig_freezing; + /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); +/** + * freezing_slow_path - slow path for testing whether a task needs to be frozen + * @p: task to be tested + * + * This function is called by freezing() if system_freezing_cnt isn't zero + * and tests whether @p needs to enter and stay in frozen state. Can be + * called under any context. The freezers are responsible for ensuring the + * target tasks see the updated state. + */ +bool freezing_slow_path(struct task_struct *p) +{ + if (p->flags & PF_NOFREEZE) + return false; + + if (pm_nosig_freezing || cgroup_freezing(p)) + return true; + + if (pm_freezing && !(p->flags & PF_FREEZER_NOSIG)) + return true; + + return false; +} +EXPORT_SYMBOL(freezing_slow_path); + /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { @@ -23,17 +55,11 @@ bool __refrigerator(bool check_kthr_stop) long save; /* - * Enter FROZEN. If NOFREEZE, schedule immediate thawing by - * clearing freezing. + * No point in checking freezing() again - the caller already did. + * Proceed to enter FROZEN. */ spin_lock_irq(&freezer_lock); repeat: - if (!freezing(current)) { - spin_unlock_irq(&freezer_lock); - return was_frozen; - } - if (current->flags & PF_NOFREEZE) - clear_freeze_flag(current); current->flags |= PF_FROZEN; spin_unlock_irq(&freezer_lock); @@ -99,18 +125,12 @@ static void fake_signal_wake_up(struct task_struct *p) bool freeze_task(struct task_struct *p, bool sig_only) { unsigned long flags; - bool ret = false; spin_lock_irqsave(&freezer_lock, flags); - - if ((p->flags & PF_NOFREEZE) || - (sig_only && !should_send_signal(p))) - goto out_unlock; - - if (frozen(p)) - goto out_unlock; - - set_freeze_flag(p); + if (!freezing(p) || frozen(p)) { + spin_unlock_irqrestore(&freezer_lock, flags); + return false; + } if (should_send_signal(p)) { fake_signal_wake_up(p); @@ -123,10 +143,9 @@ bool freeze_task(struct task_struct *p, bool sig_only) } else { wake_up_state(p, TASK_INTERRUPTIBLE); } - ret = true; -out_unlock: + spin_unlock_irqrestore(&freezer_lock, flags); - return ret; + return true; } void __thaw_task(struct task_struct *p) @@ -143,7 +162,6 @@ void __thaw_task(struct task_struct *p) * avoid leaving dangling TIF_SIGPENDING behind. */ spin_lock_irqsave(&freezer_lock, flags); - clear_freeze_flag(p); if (frozen(p)) { wake_up_process(p); } else { diff --git a/kernel/power/process.c b/kernel/power/process.c index 9f6f5c755cfa..0beb51e1dec9 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -101,7 +101,7 @@ static int try_to_freeze_tasks(bool sig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { if (!wakeup && !freezer_should_skip(p) && - freezing(p) && !frozen(p)) + p != current && freezing(p) && !frozen(p)) sched_show_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -122,7 +122,11 @@ int freeze_processes(void) { int error; + if (!pm_freezing) + atomic_inc(&system_freezing_cnt); + printk("Freezing user space processes ... "); + pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) { printk("done."); @@ -146,6 +150,7 @@ int freeze_kernel_threads(void) int error; printk("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) printk("done."); @@ -162,6 +167,11 @@ void thaw_processes(void) { struct task_struct *g, *p; + if (pm_freezing) + atomic_dec(&system_freezing_cnt); + pm_freezing = false; + pm_nosig_freezing = false; + oom_killer_enable(); printk("Restarting tasks ... "); @@ -170,9 +180,6 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (cgroup_freezing(p)) - continue; - __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -- cgit v1.2.3 From 948246f70a811c872b9d93bb4a8ab5823c4c79e0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: remove should_send_signal() and update frozen() should_send_signal() is only used in freezer.c. Exporting them only increases chance of abuse. Open code the two users and remove it. Update frozen() to return bool. Signed-off-by: Tejun Heo --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index f53cd5aa5b2e..95a123844241 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -132,7 +132,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) return false; } - if (should_send_signal(p)) { + if (!(p->flags & PF_FREEZER_NOSIG)) { fake_signal_wake_up(p); /* * fake_signal_wake_up() goes through p's scheduler -- cgit v1.2.3 From 96ee6d8539c9fc6742908d85eb9723abb5c91854 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: fix set_freezable[_with_signal]() race A kthread doing set_freezable*() may race with on-going PM freeze and the freezer might think all tasks are frozen while the new freezable kthread is merrily proceeding to execute code paths which aren't supposed to be executing during PM freeze. Reimplement set_freezable[_with_signal]() using __set_freezable() such that freezable PF flags are modified under freezer_lock and try_to_freeze() is called afterwards. This eliminates race condition against freezing. Note: Separated out from larger patch to resolve fix order dependency Oleg pointed out. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/freezer.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 95a123844241..b1e7a7b3d2cd 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -171,3 +171,28 @@ void __thaw_task(struct task_struct *p) } spin_unlock_irqrestore(&freezer_lock, flags); } + +/** + * __set_freezable - make %current freezable + * @with_signal: do we want %TIF_SIGPENDING for notification too? + * + * Mark %current freezable and enter refrigerator if necessary. + */ +bool __set_freezable(bool with_signal) +{ + might_sleep(); + + /* + * Modify flags while holding freezer_lock. This ensures the + * freezer notices that we aren't frozen yet or the freezing + * condition is visible to try_to_freeze() below. + */ + spin_lock_irq(&freezer_lock); + current->flags &= ~PF_NOFREEZE; + if (with_signal) + current->flags &= ~PF_FREEZER_NOSIG; + spin_unlock_irq(&freezer_lock); + + return try_to_freeze(); +} +EXPORT_SYMBOL(__set_freezable); -- cgit v1.2.3 From 5ece3eae4cdb968c269e0bc7e2c0e2b223552025 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: freezer: restructure __refrigerator() If another freeze happens before all tasks leave FROZEN state after being thawed, the freezer can see the existing FROZEN and consider the tasks to be frozen but they can clear FROZEN without checking the new freezing(). Oleg suggested restructuring __refrigerator() such that there's single condition check section inside freezer_lock and sigpending is cleared afterwards, which fixes the problem and simplifies the code. Restructure accordingly. -v2: Frozen loop exited without releasing freezer_lock. Fixed. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: "Rafael J. Wysocki" --- kernel/freezer.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index b1e7a7b3d2cd..c3496c6a91b2 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -52,39 +52,29 @@ bool __refrigerator(bool check_kthr_stop) /* Hmm, should we be allowed to suspend when there are realtime processes around? */ bool was_frozen = false; - long save; + long save = current->state; - /* - * No point in checking freezing() again - the caller already did. - * Proceed to enter FROZEN. - */ - spin_lock_irq(&freezer_lock); -repeat: - current->flags |= PF_FROZEN; - spin_unlock_irq(&freezer_lock); - - save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock_irq(&freezer_lock); + current->flags |= PF_FROZEN; if (!freezing(current) || (check_kthr_stop && kthread_should_stop())) + current->flags &= ~PF_FROZEN; + spin_unlock_irq(&freezer_lock); + + if (!(current->flags & PF_FROZEN)) break; was_frozen = true; schedule(); } - /* leave FROZEN */ - spin_lock_irq(&freezer_lock); - if (freezing(current)) - goto repeat; - current->flags &= ~PF_FROZEN; - spin_unlock_irq(&freezer_lock); + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); pr_debug("%s left refrigerator\n", current->comm); -- cgit v1.2.3 From 37ad8aca94a1da2112a7c56151390914e80d1113 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: freezer: use lock_task_sighand() in fake_signal_wake_up() cgroup_freezer calls freeze_task() without holding tasklist_lock and, if the task is exiting, its ->sighand may be gone by the time fake_signal_wake_up() is called. Use lock_task_sighand() instead of accessing ->sighand directly. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: "Rafael J. Wysocki" Cc: Paul Menage --- kernel/freezer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c3496c6a91b2..389549f0a94e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -93,9 +93,10 @@ static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + if (lock_task_sighand(p, &flags)) { + signal_wake_up(p, 0); + unlock_task_sighand(p, &flags); + } } /** -- cgit v1.2.3 From 839e3407d90a810318d17c17ceb3d5928a910704 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:26 -0800 Subject: freezer: remove unused @sig_only from freeze_task() After "freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE", freezing() returns authoritative answer on whether the current task should freeze or not and freeze_task() doesn't need or use @sig_only. Remove it. While at it, rewrite function comment for freeze_task() and rename @sig_only to @user_only in try_to_freeze_tasks(). This patch doesn't cause any functional change. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/cgroup_freezer.c | 4 ++-- kernel/freezer.c | 21 +++++++++------------ kernel/power/process.c | 8 ++++---- 3 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2327ad11725f..e411a60cc2c8 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -206,7 +206,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) - freeze_task(task, true); + freeze_task(task); spin_unlock_irq(&freezer->lock); } @@ -274,7 +274,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task, true)) + if (!freeze_task(task)) continue; if (frozen(task)) continue; diff --git a/kernel/freezer.c b/kernel/freezer.c index 389549f0a94e..2589a61de44c 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -100,20 +100,17 @@ static void fake_signal_wake_up(struct task_struct *p) } /** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * freeze_task - send a freeze request to given task + * @p: task to send the request to * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. + * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE + * flag and either sending a fake signal to it or waking it up, depending + * on whether it has %PF_FREEZER_NOSIG set. + * + * RETURNS: + * %false, if @p is not freezing or already frozen; %true, otherwise */ -bool freeze_task(struct task_struct *p, bool sig_only) +bool freeze_task(struct task_struct *p) { unsigned long flags; diff --git a/kernel/power/process.c b/kernel/power/process.c index 0beb51e1dec9..77274c9ba2f1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,7 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static int try_to_freeze_tasks(bool sig_only) +static int try_to_freeze_tasks(bool user_only) { struct task_struct *g, *p; unsigned long end_time; @@ -37,14 +37,14 @@ static int try_to_freeze_tasks(bool sig_only) end_time = jiffies + TIMEOUT; - if (!sig_only) + if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (p == current || !freeze_task(p, sig_only)) + if (p == current || !freeze_task(p)) continue; /* @@ -65,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only) } while_each_thread(g, p); read_unlock(&tasklist_lock); - if (!sig_only) { + if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } -- cgit v1.2.3 From 34b087e48367c252e343c2f8de65676a78af1e4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Nov 2011 09:28:17 -0800 Subject: freezer: kill unused set_freezable_with_signal() There's no in-kernel user of set_freezable_with_signal() left. Mixing TIF_SIGPENDING with kernel threads can lead to nasty corner cases as kernel threads never travel signal delivery path on their own. e.g. the current implementation is buggy in the cancelation path of __thaw_task(). It calls recalc_sigpending_and_wake() in an attempt to clear TIF_SIGPENDING but the function never clears it regardless of sigpending state. This means that signallable freezable kthreads may continue executing with !freezing() && stuck TIF_SIGPENDING, which can be troublesome. This patch removes set_freezable_with_signal() along with PF_FREEZER_NOSIG and recalc_sigpending*() calls in freezer. User tasks get TIF_SIGPENDING, kernel tasks get woken up and the spurious sigpending is dealt with in the usual signal delivery path. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/freezer.c | 27 ++++++--------------------- kernel/kthread.c | 2 +- 2 files changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 2589a61de44c..9815b8d1eed5 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -39,7 +39,7 @@ bool freezing_slow_path(struct task_struct *p) if (pm_nosig_freezing || cgroup_freezing(p)) return true; - if (pm_freezing && !(p->flags & PF_FREEZER_NOSIG)) + if (pm_freezing && !(p->flags & PF_KTHREAD)) return true; return false; @@ -72,10 +72,6 @@ bool __refrigerator(bool check_kthr_stop) schedule(); } - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - pr_debug("%s left refrigerator\n", current->comm); /* @@ -120,7 +116,7 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_FREEZER_NOSIG)) { + if (!(p->flags & PF_KTHREAD)) { fake_signal_wake_up(p); /* * fake_signal_wake_up() goes through p's scheduler @@ -145,28 +141,19 @@ void __thaw_task(struct task_struct *p) * be visible to @p as waking up implies wmb. Waking up inside * freezer_lock also prevents wakeups from leaking outside * refrigerator. - * - * If !FROZEN, @p hasn't reached refrigerator, recalc sigpending to - * avoid leaving dangling TIF_SIGPENDING behind. */ spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) { + if (frozen(p)) wake_up_process(p); - } else { - spin_lock(&p->sighand->siglock); - recalc_sigpending_and_wake(p); - spin_unlock(&p->sighand->siglock); - } spin_unlock_irqrestore(&freezer_lock, flags); } /** - * __set_freezable - make %current freezable - * @with_signal: do we want %TIF_SIGPENDING for notification too? + * set_freezable - make %current freezable * * Mark %current freezable and enter refrigerator if necessary. */ -bool __set_freezable(bool with_signal) +bool set_freezable(void) { might_sleep(); @@ -177,10 +164,8 @@ bool __set_freezable(bool with_signal) */ spin_lock_irq(&freezer_lock); current->flags &= ~PF_NOFREEZE; - if (with_signal) - current->flags &= ~PF_FREEZER_NOSIG; spin_unlock_irq(&freezer_lock); return try_to_freeze(); } -EXPORT_SYMBOL(__set_freezable); +EXPORT_SYMBOL(set_freezable); diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c36deaae2f1..3d3de633702e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -282,7 +282,7 @@ int kthreadd(void *unused) set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_states[N_HIGH_MEMORY]); - current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; + current->flags |= PF_NOFREEZE; for (;;) { set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 341d4166175e9b7911444f5a33b1c9efb8f15c85 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 19 Nov 2011 14:39:01 +0100 Subject: PM: Fix indentation and remove extraneous whitespaces in kernel/power/main.c Lack of proper indentation of the goto statement decreases the readability of code significantly. In fact, this made me look twice at the code to check whether it really does what it should be doing. Fix this. And in the same file, there are some extra whitespaces. Get rid of them too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 36e0f0903c32..7d36fb31e4c4 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * + * * This file is released under the GPLv2 * */ @@ -240,7 +240,7 @@ struct kobject *power_kobj; * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and * 'disk' (Suspend-to-Disk). * - * store() accepts one of those strings, translates it into the + * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, /* First, check if we are requested to hibernate */ if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); - goto Exit; + goto Exit; } #ifdef CONFIG_SUSPEND -- cgit v1.2.3 From 953a206393b1533ceb0e7d725cc5a8c8d7ed97dd Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 22 Nov 2011 23:20:31 +0100 Subject: PM / Hibernate: Refactor and simplify hibernation_snapshot() code The goto statements in hibernation_snapshot() are a bit complex. Refactor the code to remove some of them, thereby simplifying the implementation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a6b0503574ee..ebf62c3bc9f7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -333,7 +333,7 @@ static int create_image(int platform_mode) */ int hibernation_snapshot(int platform_mode) { - pm_message_t msg = PMSG_RECOVER; + pm_message_t msg; int error; error = platform_begin(platform_mode); @@ -362,26 +362,26 @@ int hibernation_snapshot(int platform_mode) error = dpm_prepare(PMSG_FREEZE); if (error) { - dpm_complete(msg); + dpm_complete(PMSG_RECOVER); goto Cleanup; } suspend_console(); pm_restrict_gfp_mask(); + error = dpm_suspend(PMSG_FREEZE); - if (error) - goto Recover_platform; - if (hibernation_test(TEST_DEVICES)) - goto Recover_platform; + if (error || hibernation_test(TEST_DEVICES)) + platform_recover(platform_mode); + else + error = create_image(platform_mode); - error = create_image(platform_mode); /* - * Control returns here (1) after the image has been created or the + * In the case that we call create_image() above, the control + * returns here (1) after the image has been created or the * image creation has failed and (2) after a successful restore. */ - Resume_devices: /* We may need to release the preallocated image pages here. */ if (error || !in_suspend) swsusp_free(); @@ -399,10 +399,6 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; - Recover_platform: - platform_recover(platform_mode); - goto Resume_devices; - Cleanup: swsusp_free(); goto Close; -- cgit v1.2.3 From 5307427a31c50041cc4d18c49e9cce1a9303ea04 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 15 Nov 2011 21:59:21 +0100 Subject: PM / Usermodehelper: Cleanup remnants of usermodehelper_pm_callback() usermodehelper_pm_callback() no longer exists in the kernel. There are 2 comments in kernel/kmod.c that still refer to it. Also, the patch that introduced usermodehelper_pm_callback(), #included two header files: and . But these are no longer necessary. This patch updates the comments as appropriate and removes the unnecessary header file inclusions. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/kmod.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a4bea97c75b6..2142687094d3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -34,8 +34,6 @@ #include #include #include -#include -#include #include #include @@ -282,14 +280,14 @@ static int usermodehelper_disabled = 1; static atomic_t running_helpers = ATOMIC_INIT(0); /* - * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_pm_callback() fails + * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -- cgit v1.2.3 From d7268a31c8aabc5a29ccc7f76de84383e5f38737 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 15 Nov 2011 21:59:31 +0100 Subject: CPU: Add right qualifiers for alloc_frozen_cpus() and cpu_hotplug_pm_sync_init() Add __init for functions alloc_frozen_cpus() and cpu_hotplug_pm_sync_init() because they are only called during boot time. Add static for function cpu_hotplug_pm_sync_init() because its scope is limited in this file only. Signed-off-by: Fenghua Yu Acked-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 563f13609470..cf915b86a5fb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -470,7 +470,7 @@ out: cpu_maps_update_done(); } -static int alloc_frozen_cpus(void) +static int __init alloc_frozen_cpus(void) { if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) return -ENOMEM; @@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, } -int cpu_hotplug_pm_sync_init(void) +static int __init cpu_hotplug_pm_sync_init(void) { pm_notifier(cpu_hotplug_pm_callback, 0); return 0; -- cgit v1.2.3 From a13b032776379fa6e2bfccf798969ca51e5fb052 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 24 Nov 2011 12:27:26 +0100 Subject: clockevents: drop unknown Kconfig symbol GENERIC_CLOCKEVENTS_MIGR There's no Kconfig symbol GENERIC_CLOCKEVENTS_MIGR, so the check for it will always fail. Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b26c2228fe92..2cf9cc7aa103 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -25,7 +25,7 @@ config HIGH_RES_TIMERS config GENERIC_CLOCKEVENTS_BUILD bool default y - depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + depends on GENERIC_CLOCKEVENTS config GENERIC_CLOCKEVENTS_MIN_ADJUST bool -- cgit v1.2.3 From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:36 +0900 Subject: x86: Panic on detection of stack overflow Currently, messages are just output on the detection of stack overflow, which is not sufficient for systems that need a high reliability. This is because in general the overflow may corrupt data, and the additional corruption may occur due to reading them unless systems stop. This patch adds the sysctl parameter kernel.panic_on_stackoverflow and causes a panic when detecting the overflows of kernel, IRQ and exception stacks except user stack according to the parameter. It is disabled by default. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae2719643854..f487f257e05e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "bootloader_type", .data = &bootloader_type, -- cgit v1.2.3 From 0118521cc7acb3ccbc1a01d6144ac32be9d56a4c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 1 Dec 2011 22:32:43 +0100 Subject: PM / Hibernate: Enable usermodehelpers in software_resume() error path In the software_resume() function defined in kernel/power/hibernate.c, if the call to create_basic_memory_bitmaps() fails, the usermodehelpers are not enabled (which had been disabled in the previous step). Fix it. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index ebf62c3bc9f7..1fcf9de4506d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -807,8 +807,10 @@ static int software_resume(void) goto close_finish; error = create_basic_memory_bitmaps(); - if (error) + if (error) { + usermodehelper_enable(); goto close_finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); -- cgit v1.2.3 From 97819a26224f019e73d88bb2fd4eb5a614860461 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 1 Dec 2011 22:33:10 +0100 Subject: PM / Hibernate: Thaw processes in SNAPSHOT_CREATE_IMAGE ioctl test path Commit 2aede851ddf08666f68ffc17be446420e9d2a056 (PM / Hibernate: Freeze kernel threads after preallocating memory) moved the freezing of kernel threads to hibernation_snapshot() function. So now, if the call to hibernation_snapshot() returns early due to a successful hibernation test, the caller has to thaw processes to ensure that the system gets back to its original state. But in SNAPSHOT_CREATE_IMAGE hibernation ioctl, the caller does not thaw processes in case hibernation_snapshot() returned due to a successful freezer test. Fix this issue. But note we still send the value of 'in_suspend' (which is now 0) to userspace, because we are not in an error path per-se, and moreover, the value of in_suspend correctly depicts the situation here. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- kernel/power/power.h | 2 ++ kernel/power/user.c | 11 ++++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1fcf9de4506d..c10cb0f916e2 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -55,7 +55,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static bool freezer_test_done; +bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; diff --git a/kernel/power/power.h b/kernel/power/power.h index 23a2db1ec442..0c4defe6d3b8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) /* kernel/power/hibernate.c */ +extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535c2b88..e2aff0fc2697 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -283,10 +283,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) + if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error) - data->ready = 1; + if (!error && !freezer_test_done) + data->ready = 1; + if (freezer_test_done) { + freezer_test_done = false; + thaw_processes(); + } + } break; case SNAPSHOT_ATOMIC_RESTORE: -- cgit v1.2.3 From 48580ab8729865c81e148d59159fbe2aa7865511 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 1 Dec 2011 22:33:20 +0100 Subject: PM / Hibernate: Remove deprecated hibernation test modes The hibernation test modes 'test' and 'testproc' are deprecated, because the 'pm_test' framework offers much more fine-grained control for debugging suspend and hibernation related problems. So, remove the deprecated 'test' and 'testproc' hibernation test modes. Suggested-by: Rafael J. Wysocki Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c10cb0f916e2..5314a94a92c1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -43,8 +43,6 @@ int in_suspend __nosavedata; enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, - HIBERNATION_TEST, - HIBERNATION_TESTPROC, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, /* keep last */ @@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void) mdelay(5000); } -static int hibernation_testmode(int mode) -{ - if (hibernation_mode == mode) { - hibernation_debug_sleep(); - return 1; - } - return 0; -} - static int hibernation_test(int level) { if (pm_test_level == level) { @@ -114,7 +103,6 @@ static int hibernation_test(int level) return 0; } #else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; } static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ @@ -278,8 +266,7 @@ static int create_image(int platform_mode) goto Platform_finish; error = disable_nonboot_cpus(); - if (error || hibernation_test(TEST_CPUS) - || hibernation_testmode(HIBERNATION_TEST)) + if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); @@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode) if (error) goto Cleanup; - if (hibernation_test(TEST_FREEZER) || - hibernation_testmode(HIBERNATION_TESTPROC)) { + if (hibernation_test(TEST_FREEZER)) { /* * Indicate to the caller that we are returning due to a @@ -586,9 +572,6 @@ int hibernation_platform_enter(void) static void power_down(void) { switch (hibernation_mode) { - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: - break; case HIBERNATION_REBOOT: kernel_restart(NULL); break; @@ -853,8 +836,6 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", - [HIBERNATION_TEST] = "test", - [HIBERNATION_TESTPROC] = "testproc", }; /* @@ -863,17 +844,15 @@ static const char * const hibernation_modes[] = { * Hibernation can be handled in several ways. There are a few different ways * to put the system into the sleep state: using the platform driver (e.g. ACPI * or other hibernation_ops), powering it off or rebooting it (for testing - * mostly), or using one of the two available test modes. + * mostly). * * The sysfs file /sys/power/disk provides an interface for selecting the * hibernation mode to use. Reading from this file causes the available modes - * to be printed. There are 5 modes that can be supported: + * to be printed. There are 3 modes that can be supported: * * 'platform' * 'shutdown' * 'reboot' - * 'test' - * 'testproc' * * If a platform hibernation driver is in use, 'platform' will be supported * and will be used by default. Otherwise, 'shutdown' will be used by default. @@ -897,8 +876,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -939,8 +916,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: - case HIBERNATION_TEST: - case HIBERNATION_TESTPROC: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: -- cgit v1.2.3 From e5b16746f0f2d6883c226af52d90904ce0f7eee8 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 3 Dec 2011 00:20:30 +0100 Subject: PM / Hibernate: Replace unintuitive 'if' condition in kernel/power/user.c with 'else' In the snapshot_ioctl() function, under SNAPSHOT_FREEZE, the code below freeze_processes() is a bit unintuitive. Improve it by replacing the second 'if' condition with an 'else' clause. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index c202e2e1a2d5..06ea33df8560 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -259,7 +259,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = freeze_processes(); if (error) usermodehelper_enable(); - if (!error) + else data->frozen = 1; break; -- cgit v1.2.3 From bcda53faf5814c0c6025a0bd47108adfcbe9f199 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 7 Dec 2011 22:29:54 +0100 Subject: PM / Sleep: Replace mutex_[un]lock(&pm_mutex) with [un]lock_system_sleep() Using [un]lock_system_sleep() is safer than directly using mutex_[un]lock() on 'pm_mutex', since the latter could lead to freezing failures. Hence convert all the present users of mutex_[un]lock(&pm_mutex) to use these safe APIs instead. Suggested-by: Tejun Heo Signed-off-by: Srivatsa S. Bhat Reviewed-by: Simon Horman Signed-off-by: Rafael J. Wysocki --- kernel/kexec.c | 4 ++-- kernel/power/hibernate.c | 16 ++++++++-------- kernel/power/main.c | 4 ++-- kernel/power/suspend.c | 4 ++-- kernel/power/user.c | 16 ++++++++-------- 5 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index dc7bc0829286..090ee10d9604 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1523,7 +1523,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - mutex_lock(&pm_mutex); + lock_system_sleep(); pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1576,7 +1576,7 @@ int kernel_kexec(void) thaw_processes(); Restore_console: pm_restore_console(); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 605149a6d219..6d6d28870335 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -69,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) WARN_ON(1); return; } - mutex_lock(&pm_mutex); + lock_system_sleep(); hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } static bool entering_platform_hibernation; @@ -597,7 +597,7 @@ int hibernate(void) { int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -665,7 +665,7 @@ int hibernate(void) pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -893,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -919,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pr_debug("PM: Hibernation mode set to '%s'\n", hibernation_modes[mode]); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } @@ -946,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (maj != MAJOR(res) || min != MINOR(res)) goto out; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_resume_device = res; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 7d36fb31e4c4..9824b41e5a18 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - mutex_lock(&pm_mutex); + lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error ? error : n; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d336b27d1104..4fd51beed879 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops; */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - mutex_lock(&pm_mutex); + lock_system_sleep(); suspend_ops = ops; - mutex_unlock(&pm_mutex); + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); diff --git a/kernel/power/user.c b/kernel/power/user.c index 06ea33df8560..98ade217da6c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; - mutex_lock(&pm_mutex); + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; @@ -123,7 +123,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->platform_support = 0; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return error; } @@ -132,7 +132,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; - mutex_lock(&pm_mutex); + lock_system_sleep(); swsusp_free(); free_basic_memory_bitmaps(); @@ -146,7 +146,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return 0; } @@ -158,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -179,7 +179,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } @@ -191,7 +191,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; - mutex_lock(&pm_mutex); + lock_system_sleep(); data = filp->private_data; @@ -208,7 +208,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - mutex_unlock(&pm_mutex); + unlock_system_sleep(); return res; } -- cgit v1.2.3 From b298d289c79211508f11cb50749b0d1d54eb244a Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 9 Dec 2011 23:36:36 +0100 Subject: PM / Sleep: Fix freezer failures due to racy usermodehelper_is_disabled() Commit a144c6a (PM: Print a warning if firmware is requested when tasks are frozen) introduced usermodehelper_is_disabled() to warn and exit immediately if firmware is requested when usermodehelpers are disabled. However, it is racy. Consider the following scenario, currently used in drivers/base/firmware_class.c: ... if (usermodehelper_is_disabled()) goto out; /* Do actual work */ ... out: return err; Nothing prevents someone from disabling usermodehelpers just after the check in the 'if' condition, which means that it is quite possible to try doing the "actual work" with usermodehelpers disabled, leading to undesirable consequences. In particular, this race condition in _request_firmware() causes task freezing failures whenever suspend/hibernation is in progress because, it wrongly waits to get the firmware/microcode image from userspace when actually the usermodehelpers are disabled or userspace has been frozen. Some of the example scenarios that cause freezing failures due to this race are those that depend on userspace via request_firmware(), such as x86 microcode module initialization and microcode image reload. Previous discussions about this issue can be found at: http://thread.gmane.org/gmane.linux.kernel/1198291/focus=1200591 This patch adds proper synchronization to fix this issue. It is to be noted that this patchset fixes the freezing failures but doesn't remove the warnings. IOW, it does not attempt to add explicit synchronization to x86 microcode driver to avoid requesting microcode image at inopportune moments. Because, the warnings were introduced to highlight such cases, in the first place. And we need not silence the warnings, since we take care of the *real* problem (freezing failure) and hence, after that, the warnings are pretty harmless anyway. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/kmod.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a4bea97c75b6..81b4a27261b2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq; static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES @@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work) * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. */ static int usermodehelper_disabled = 1; @@ -293,6 +296,18 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) +void read_lock_usermodehelper(void) +{ + down_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_lock_usermodehelper); + +void read_unlock_usermodehelper(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); + /** * usermodehelper_disable - prevent new helpers from being started */ @@ -300,8 +315,10 @@ int usermodehelper_disable(void) { long retval; + down_write(&umhelper_sem); usermodehelper_disabled = 1; - smp_mb(); + up_write(&umhelper_sem); + /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to @@ -314,7 +331,9 @@ int usermodehelper_disable(void) if (retval) return 0; + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); return -EAGAIN; } @@ -323,7 +342,9 @@ int usermodehelper_disable(void) */ void usermodehelper_enable(void) { + down_write(&umhelper_sem); usermodehelper_disabled = 0; + up_write(&umhelper_sem); } /** -- cgit v1.2.3 From cf007e3526a785a95a738d5a8fba44f1f4fe33e0 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 8 Dec 2011 23:42:53 +0100 Subject: PM / Hibernate: Remove deprecated hibernation snapshot ioctls Several snapshot ioctls were marked for removal quite some time ago, since they were deprecated. Remove them. Suggested-by: Rafael J. Wysocki Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 87 ----------------------------------------------------- 1 file changed, 87 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 98ade217da6c..78bdb4404aab 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -30,28 +30,6 @@ #include "power.h" -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future. They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE 1 -#define PMOPS_ENTER 2 -#define PMOPS_FINISH 3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones. They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) - #define SNAPSHOT_MINOR 231 @@ -213,15 +191,6 @@ unlock: return res; } -static void snapshot_deprecated_ioctl(unsigned int cmd) -{ - if (printk_ratelimit()) - printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " - "be removed soon, update your suspend-to-disk " - "utilities\n", - __builtin_return_address(0), cmd); -} - static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -272,8 +241,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_ATOMIC_SNAPSHOT: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; @@ -308,8 +275,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_SET_IMAGE_SIZE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -324,16 +289,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_GET_SWAP_PAGE: - snapshot_deprecated_ioctl(cmd); case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; @@ -356,27 +317,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_all_swap_pages(data->swap); break; - case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - if (!swsusp_swap_in_use()) { - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg), - 0, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } else { - error = -EPERM; - } - break; - case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; @@ -399,33 +339,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_platform_enter(); break; - case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ - snapshot_deprecated_ioctl(cmd); - error = -EINVAL; - - switch (arg) { - - case PMOPS_PREPARE: - data->platform_support = 1; - error = 0; - break; - - case PMOPS_ENTER: - if (data->platform_support) - error = hibernation_platform_enter(); - break; - - case PMOPS_FINISH: - if (data->platform_support) - error = 0; - break; - - default: - printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - - } - break; - case SNAPSHOT_SET_SWAP_AREA: if (swsusp_swap_in_use()) { error = -EPERM; -- cgit v1.2.3 From e25e2cbb4c6679bed5f52fb0f2cc381688297901 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: add cgroup_root_mutex cgroup wants to make threadgroup stable while modifying cgroup hierarchies which will introduce locking dependency on cred_guard_mutex from cgroup_mutex. This unfortunately completes circular dependency. A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem B. namespace_sem -> cgroup_mutex B is from cgroup_show_options() and this patch breaks it by introducing another mutex cgroup_root_mutex which nests inside cgroup_mutex and protects cgroupfs_root. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Oleg Nesterov --- kernel/cgroup.c | 64 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9d5648f3cdc..6545fd61b10d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,7 +63,24 @@ #include +/* + * cgroup_mutex is the master lock. Any modification to cgroup or its + * hierarchy must be performed while holding it. + * + * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify + * cgroupfs_root of any cgroup hierarchy - subsys list, flags, + * release_agent_path and so on. Modifying requires both cgroup_mutex and + * cgroup_root_mutex. Readers can acquire either of the two. This is to + * break the following locking order cycle. + * + * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem + * B. namespace_sem -> cgroup_mutex + * + * B happens only through cgroup_show_options() and using cgroup_root_mutex + * breaks it. + */ static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_MUTEX(cgroup_root_mutex); /* * Generate an array of cgroup subsystem pointers. At boot time, this is @@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); + BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; @@ -1043,7 +1061,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); if (test_bit(ROOT_NOPREFIX, &root->flags)) @@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) out_unlock: kfree(opts.release_agent); kfree(opts.name); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; @@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct inode *inode; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* We used the new root structure, so this is a new hierarchy */ struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; - struct inode *inode; struct cgroupfs_root *existing_root; const struct cred *cred; int i; @@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); - if (strlen(root->name)) { - /* Check for name clashes with existing mounts */ - for_each_active_root(existing_root) { - if (!strcmp(existing_root->name, root->name)) { - ret = -EBUSY; - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } - } - } + /* Check for name clashes with existing mounts */ + ret = -EBUSY; + if (strlen(root->name)) + for_each_active_root(existing_root) + if (!strcmp(existing_root->name, root->name)) + goto unlock_drop; /* * We're accessing css_set_count without locking @@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * have some link structures left over */ ret = allocate_cg_links(css_set_count, &tmp_cg_links); - if (ret) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - goto drop_new_super; - } + if (ret) + goto unlock_drop; ret = rebind_subsystems(root, root->subsys_bits); if (ret == -EBUSY) { - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); free_cg_links(&tmp_cg_links); - goto drop_new_super; + goto unlock_drop; } /* * There must be no failure case after here, since rebinding @@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cred = override_creds(&init_cred); cgroup_populate_dir(root_cgrp); revert_creds(cred); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { @@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, kfree(opts.name); return dget(sb->s_root); + unlock_drop: + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); drop_modules: @@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ ret = rebind_subsystems(root, 0); @@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) { root_count--; } + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); kill_litter_super(sb); @@ -2311,7 +2329,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); + mutex_unlock(&cgroup_root_mutex); cgroup_unlock(); return 0; } -- cgit v1.2.3 From 257058ae2b971646b96ab3a15605ac69186e562a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: threadgroup: rename signal->threadgroup_fork_lock to ->group_rwsem Make the following renames to prepare for extension of threadgroup locking. * s/signal->threadgroup_fork_lock/signal->group_rwsem/ * s/threadgroup_fork_read_lock()/threadgroup_change_begin()/ * s/threadgroup_fork_read_unlock()/threadgroup_change_end()/ * s/threadgroup_fork_write_lock()/threadgroup_lock()/ * s/threadgroup_fork_write_unlock()/threadgroup_unlock()/ This patch doesn't cause any behavior change. -v2: Rename threadgroup_change_done() to threadgroup_change_end() per KAMEZAWA's suggestion. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 13 ++++++------- kernel/fork.c | 8 ++++---- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6545fd61b10d..b409df3b2e9d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2003,8 +2003,8 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, * @cgrp: the cgroup to attach to * @leader: the threadgroup leader task_struct of the group to be attached * - * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will - * take task_lock of each thread in leader's threadgroup individually in turn. + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of each thread in leader's threadgroup individually in turn. */ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { @@ -2030,8 +2030,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 0: in order to do expensive, possibly blocking operations for * every thread, we cannot iterate the thread group list, since it needs * rcu or tasklist locked. instead, build an array of all threads in the - * group - threadgroup_fork_lock prevents new threads from appearing, - * and if threads exit, this will just be an over-estimate. + * group - group_rwsem prevents new threads from appearing, and if + * threads exit, this will just be an over-estimate. */ group_size = get_nr_threads(leader); /* flex_array supports very large thread-groups better than kmalloc. */ @@ -2249,7 +2249,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cgroup_unlock(); return -ESRCH; } - /* * even if we're attaching all tasks in the thread group, we * only need to check permissions on one of them. @@ -2273,9 +2272,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) } if (threadgroup) { - threadgroup_fork_write_lock(tsk); + threadgroup_lock(tsk); ret = cgroup_attach_proc(cgrp, tsk); - threadgroup_fork_write_unlock(tsk); + threadgroup_unlock(tsk); } else { ret = cgroup_attach_task(cgrp, tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index 827808613847..d4ac9e3e0075 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -972,7 +972,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sched_autogroup_fork(sig); #ifdef CONFIG_CGROUPS - init_rwsem(&sig->threadgroup_fork_lock); + init_rwsem(&sig->group_rwsem); #endif sig->oom_adj = current->signal->oom_adj; @@ -1157,7 +1157,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->io_context = NULL; p->audit_context = NULL; if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_lock(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1372,7 +1372,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_unlock(current); + threadgroup_change_end(current); perf_event_fork(p); return p; @@ -1407,7 +1407,7 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) - threadgroup_fork_read_unlock(current); + threadgroup_change_end(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); -- cgit v1.2.3 From 77e4ef99d1c596a31747668e5fd837f77b6349b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup_lock() protected only protected against new addition to the threadgroup, which was inherently somewhat incomplete and problematic for its only user cgroup. On-going migration could race against exec and exit leading to interesting problems - the symmetry between various attach methods, task exiting during method execution, ->exit() racing against attach methods, migrating task switching basic properties during exec and so on. This patch extends threadgroup_lock() such that it protects against all three threadgroup altering operations - fork, exit and exec. For exit, threadgroup_change_begin/end() calls are added to exit_signals around assertion of PF_EXITING. For exec, threadgroup_[un]lock() are updated to also grab and release cred_guard_mutex. With this change, threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. The next patch will update cgroup so that it can take full advantage of this change. -v2: beefed up comment as suggested by Frederic. -v3: narrowed scope of protection in exit path as suggested by Frederic. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage Cc: Linus Torvalds --- kernel/signal.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b3f78d09a105..399c184bf0ae 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2359,8 +2359,15 @@ void exit_signals(struct task_struct *tsk) int group_stop = 0; sigset_t unblocked; + /* + * @tsk is about to have PF_EXITING set - lock out users which + * expect stable threadgroup. + */ + threadgroup_change_begin(tsk); + if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; + threadgroup_change_end(tsk); return; } @@ -2370,6 +2377,9 @@ void exit_signals(struct task_struct *tsk) * see wants_signal(), do_signal_stop(). */ tsk->flags |= PF_EXITING; + + threadgroup_change_end(tsk); + if (!signal_pending(tsk)) goto out; -- cgit v1.2.3 From cd3d095275374220921fcf0d4e0c16584b26ddbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: always lock threadgroup during migration Update cgroup to take advantage of the fack that threadgroup_lock() guarantees stable threadgroup. * Lock threadgroup even if the target is a single task. This guarantees that when the target tasks stay stable during migration regardless of the target type. * Remove PF_EXITING early exit optimization from attach_task_by_pid() and check it in cgroup_task_migrate() instead. The optimization was for rather cold path to begin with and PF_EXITING state can be trusted throughout migration by checking it after locking threadgroup. * Don't add PF_EXITING tasks to target task array in cgroup_attach_proc(). This ensures that task migration is performed only for live tasks. * Remove -ESRCH failure path from cgroup_task_migrate(). With the above changes, it's guaranteed to be called only for live tasks. After the changes, only live tasks are migrated and they're guaranteed to stay alive until migration is complete. This removes problems caused by exec and exit racing against cgroup migration including symmetry among cgroup attach methods and different cgroup methods racing each other. v2: Oleg pointed out that one more PF_EXITING check can be removed from cgroup_attach_proc(). Removed. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Li Zefan Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 62 +++++++++++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b409df3b2e9d..d71e012e81be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1762,7 +1762,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); * * 'guarantee' is set if the caller promises that a new css_set for the task * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Otherwise, it can only fail with -ESRCH. + * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, bool guarantee) @@ -1800,13 +1800,9 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, } put_css_set(oldcg); - /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + /* @tsk can't exit as its threadgroup is locked */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - return -ESRCH; - } + WARN_ON_ONCE(tsk->flags & PF_EXITING); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1832,8 +1828,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * @cgrp: the cgroup the task is attaching to * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'tsk' during call. + * Call with cgroup_mutex and threadgroup locked. May take task_lock of + * @tsk during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1842,6 +1838,10 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; + /* @tsk either already exited or can't exit until the end */ + if (tsk->flags & PF_EXITING) + return -ESRCH; + /* Nothing to do if the task is already in that cgroup */ oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) @@ -2062,6 +2062,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) tsk = leader; i = 0; do { + /* @tsk either already exited or can't exit until the end */ + if (tsk->flags & PF_EXITING) + continue; + /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); get_task_struct(tsk); @@ -2116,11 +2120,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) continue; /* get old css_set pointer */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - /* ignore this task if it's going away */ - task_unlock(tsk); - continue; - } oldcg = tsk->cgroups; get_css_set(oldcg); task_unlock(tsk); @@ -2153,16 +2152,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) continue; - /* if the thread is PF_EXITING, it can just get skipped. */ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - if (retval == 0) { - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } - } else { - BUG_ON(retval != -ESRCH); + BUG_ON(retval); + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); } } /* nothing is sensitive to fork() after this point. */ @@ -2215,8 +2210,8 @@ out_free_group_list: /* * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will take - * cgroup_mutex; may take task_lock of task. + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup; may take task_lock of task. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -2243,11 +2238,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) * detect it later. */ tsk = tsk->group_leader; - } else if (tsk->flags & PF_EXITING) { - /* optimization for the single-task-only case */ - rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,13 +2261,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) get_task_struct(tsk); } - if (threadgroup) { - threadgroup_lock(tsk); + threadgroup_lock(tsk); + + if (threadgroup) ret = cgroup_attach_proc(cgrp, tsk); - threadgroup_unlock(tsk); - } else { + else ret = cgroup_attach_task(cgrp, tsk); - } + + threadgroup_unlock(tsk); + put_task_struct(tsk); cgroup_unlock(); return ret; -- cgit v1.2.3 From 134d33737f9015761c3832f6b268fae6274aac7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: improve old cgroup handling in cgroup_attach_proc() cgroup_attach_proc() behaves differently from cgroup_attach_task() in the following aspects. * All hooks are invoked even if no task is actually being moved. * ->can_attach_task() is called for all tasks in the group whether the new cgrp is different from the current cgrp or not; however, ->attach_task() is skipped if new equals new. This makes the calls asymmetric. This patch improves old cgroup handling in cgroup_attach_proc() by looking up the current cgroup at the head, recording it in the flex array along with the task itself, and using it to remove the above two differences. This will also ease further changes. -v2: nr_todo renamed to nr_migrating_tasks as per Paul Menage's suggestion. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Paul Menage Acked-by: Li Zefan --- kernel/cgroup.c | 66 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d71e012e81be..0f2d00519d37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1757,6 +1757,11 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +struct task_and_cgroup { + struct task_struct *task; + struct cgroup *cgrp; +}; + /* * cgroup_task_migrate - move a task from one cgroup to another. * @@ -2008,15 +2013,15 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, */ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { - int retval, i, group_size; + int retval, i, group_size, nr_migrating_tasks; struct cgroup_subsys *ss, *failed_ss = NULL; bool cancel_failed_ss = false; /* guaranteed to be initialized later, but the compiler needs this */ - struct cgroup *oldcgrp = NULL; struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; + struct task_and_cgroup *tc; struct flex_array *group; /* * we need to make sure we have css_sets for all the tasks we're @@ -2035,8 +2040,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ group_size = get_nr_threads(leader); /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(struct task_struct *), group_size, - GFP_KERNEL); + group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ @@ -2060,8 +2064,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } /* take a reference on each task in the group to go in the array. */ tsk = leader; - i = 0; + i = nr_migrating_tasks = 0; do { + struct task_and_cgroup ent; + /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) continue; @@ -2073,14 +2079,23 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + ent.task = tsk; + ent.cgrp = task_cgroup_from_root(tsk, root); + retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; + if (ent.cgrp != cgrp) + nr_migrating_tasks++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; read_unlock(&tasklist_lock); + /* methods shouldn't be called if no task is actually migrating */ + retval = 0; + if (!nr_migrating_tasks) + goto out_put_tasks; + /* * step 1: check that we can legitimately attach to the cgroup. */ @@ -2096,8 +2111,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (ss->can_attach_task) { /* run on each task in the threadgroup. */ for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - retval = ss->can_attach_task(cgrp, tsk); + tc = flex_array_get(group, i); + if (tc->cgrp == cgrp) + continue; + retval = ss->can_attach_task(cgrp, tc->task); if (retval) { failed_ss = ss; cancel_failed_ss = true; @@ -2113,18 +2130,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); + tc = flex_array_get(group, i); /* nothing to do if this task is already in the cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) + if (tc->cgrp == cgrp) continue; /* get old css_set pointer */ - task_lock(tsk); - oldcg = tsk->cgroups; + task_lock(tc->task); + oldcg = tc->task->cgroups; get_css_set(oldcg); - task_unlock(tsk); + task_unlock(tc->task); /* see if the new one for us is already in the list? */ - if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + if (css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) { /* was already there, nothing to do. */ put_css_set(oldcg); } else { @@ -2147,17 +2163,16 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) ss->pre_attach(cgrp); } for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); + tc = flex_array_get(group, i); /* leave current thread as it is if it's already there */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) + if (tc->cgrp == cgrp) continue; - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); BUG_ON(retval); /* attach each task to each subsystem */ for_each_subsys(root, ss) { if (ss->attach_task) - ss->attach_task(cgrp, tsk); + ss->attach_task(cgrp, tc->task); } } /* nothing is sensitive to fork() after this point. */ @@ -2168,8 +2183,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * being moved, this call will need to be reworked to communicate that. */ for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, leader); + if (ss->attach) { + tc = flex_array_get(group, 0); + ss->attach(ss, cgrp, tc->cgrp, tc->task); + } } /* @@ -2198,10 +2215,11 @@ out_cancel_attach: ss->cancel_attach(ss, cgrp, leader); } } +out_put_tasks: /* clean up the array of referenced threads in the group. */ for (i = 0; i < group_size; i++) { - tsk = flex_array_get_ptr(group, i); - put_task_struct(tsk); + tc = flex_array_get(group, i); + put_task_struct(tc->task); } out_free_group_list: flex_array_free(group); -- cgit v1.2.3 From 2f7ee5691eecb67c8108b92001a85563ea336ac5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: introduce cgroup_taskset and use it in subsys->can_attach(), cancel_attach() and attach() Currently, there's no way to pass multiple tasks to cgroup_subsys methods necessitating the need for separate per-process and per-task methods. This patch introduces cgroup_taskset which can be used to pass multiple tasks and their associated cgroups to cgroup_subsys methods. Three methods - can_attach(), cancel_attach() and attach() - are converted to use cgroup_taskset. This unifies passed parameters so that all methods have access to all information. Conversions in this patchset are identical and don't introduce any behavior change. -v2: documentation updated as per Paul Menage's suggestion. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Paul Menage Acked-by: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: James Morris --- kernel/cgroup.c | 99 ++++++++++++++++++++++++++++++++++++++++++++----- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 18 +++++---- 3 files changed, 100 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f2d00519d37..41ee01e392e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1757,11 +1757,85 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * Control Group taskset + */ struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; }; +struct cgroup_taskset { + struct task_and_cgroup single; + struct flex_array *tc_array; + int tc_array_len; + int idx; + struct cgroup *cur_cgrp; +}; + +/** + * cgroup_taskset_first - reset taskset and return the first task + * @tset: taskset of interest + * + * @tset iteration is initialized and the first task is returned. + */ +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +{ + if (tset->tc_array) { + tset->idx = 0; + return cgroup_taskset_next(tset); + } else { + tset->cur_cgrp = tset->single.cgrp; + return tset->single.task; + } +} +EXPORT_SYMBOL_GPL(cgroup_taskset_first); + +/** + * cgroup_taskset_next - iterate to the next task in taskset + * @tset: taskset of interest + * + * Return the next task in @tset. Iteration must have been initialized + * with cgroup_taskset_first(). + */ +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +{ + struct task_and_cgroup *tc; + + if (!tset->tc_array || tset->idx >= tset->tc_array_len) + return NULL; + + tc = flex_array_get(tset->tc_array, tset->idx++); + tset->cur_cgrp = tc->cgrp; + return tc->task; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_next); + +/** + * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * @tset: taskset of interest + * + * Return the cgroup for the current (last returned) task of @tset. This + * function must be preceded by either cgroup_taskset_first() or + * cgroup_taskset_next(). + */ +struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +{ + return tset->cur_cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); + +/** + * cgroup_taskset_size - return the number of tasks in taskset + * @tset: taskset of interest + */ +int cgroup_taskset_size(struct cgroup_taskset *tset) +{ + return tset->tc_array ? tset->tc_array_len : 1; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_size); + + /* * cgroup_task_migrate - move a task from one cgroup to another. * @@ -1842,6 +1916,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; + struct cgroup_taskset tset = { }; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1852,9 +1927,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) if (cgrp == oldcgrp) return 0; + tset.single.task = tsk; + tset.single.cgrp = oldcgrp; + for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1885,7 +1963,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) if (ss->attach_task) ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, &tset); } synchronize_rcu(); @@ -1907,7 +1985,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk); + ss->cancel_attach(ss, cgrp, &tset); } } return retval; @@ -2023,6 +2101,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; + struct cgroup_taskset tset = { }; /* * we need to make sure we have css_sets for all the tasks we're * going to move -before- we actually start moving them, so that in @@ -2089,6 +2168,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; + tset.tc_array = group; + tset.tc_array_len = group_size; read_unlock(&tasklist_lock); /* methods shouldn't be called if no task is actually migrating */ @@ -2101,7 +2182,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, leader); + retval = ss->can_attach(ss, cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2183,10 +2264,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * being moved, this call will need to be reworked to communicate that. */ for_each_subsys(root, ss) { - if (ss->attach) { - tc = flex_array_get(group, 0); - ss->attach(ss, cgrp, tc->cgrp, tc->task); - } + if (ss->attach) + ss->attach(ss, cgrp, &tset); } /* @@ -2208,11 +2287,11 @@ out_cancel_attach: for_each_subsys(root, ss) { if (ss == failed_ss) { if (cancel_failed_ss && ss->cancel_attach) - ss->cancel_attach(ss, cgrp, leader); + ss->cancel_attach(ss, cgrp, &tset); break; } if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, leader); + ss->cancel_attach(ss, cgrp, &tset); } } out_put_tasks: diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e411a60cc2c8..e95c6fb65cc0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static void freezer_destroy(struct cgroup_subsys *ss, */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct cgroup_taskset *tset) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fe58c46a426..512bd59e8627 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1371,10 +1371,10 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -1387,7 +1387,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may * be changed. */ - if (tsk->flags & PF_THREAD_BOUND) + if (cgroup_taskset_first(tset)->flags & PF_THREAD_BOUND) return -EINVAL; return 0; @@ -1437,12 +1437,14 @@ static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) cpuset_update_task_spread_flag(cs, tsk); } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { struct mm_struct *mm; - struct cpuset *cs = cgroup_cs(cont); - struct cpuset *oldcs = cgroup_cs(oldcont); + struct task_struct *tsk = cgroup_taskset_first(tset); + struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *oldcs = cgroup_cs(oldcgrp); /* * Change mm, possibly for multiple threads in a threadgroup. This is -- cgit v1.2.3 From bb9d97b6dffa10cec5e1ce9adbce60f3c2b5eabc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:21 -0800 Subject: cgroup: don't use subsys->can_attach_task() or ->attach_task() Now that subsys->can_attach() and attach() take @tset instead of @task, they can handle per-task operations. Convert ->can_attach_task() and ->attach_task() users to use ->can_attach() and attach() instead. Most converions are straight-forward. Noteworthy changes are, * In cgroup_freezer, remove unnecessary NULL assignments to unused methods. It's useless and very prone to get out of sync, which already happened. * In cpuset, PF_THREAD_BOUND test is checked for each task. This doesn't make any practical difference but is conceptually cleaner. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Li Zefan Cc: Paul Menage Cc: Balbir Singh Cc: Daisuke Nishimura Cc: James Morris Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/cgroup_freezer.c | 14 +++------- kernel/cpuset.c | 70 ++++++++++++++++++++++--------------------------- kernel/events/core.c | 13 +++++---- kernel/sched.c | 31 +++++++++++++--------- 4 files changed, 63 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e95c6fb65cc0..0e748059ba87 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -162,10 +162,14 @@ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup_taskset *tset) { struct freezer *freezer; + struct task_struct *task; /* * Anything frozen can't move or be moved to/from. */ + cgroup_taskset_for_each(task, new_cgroup, tset) + if (cgroup_freezing(task)) + return -EBUSY; freezer = cgroup_freezer(new_cgroup); if (freezer->state != CGROUP_THAWED) @@ -174,11 +178,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - return cgroup_freezing(tsk) ? -EBUSY : 0; -} - static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; @@ -374,10 +373,5 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, - .can_attach_task = freezer_can_attach_task, - .pre_attach = NULL, - .attach_task = NULL, - .attach = NULL, .fork = freezer_fork, - .exit = NULL, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 512bd59e8627..9a8a61301524 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1375,33 +1375,34 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); + struct task_struct *task; + int ret; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - /* - * Kthreads bound to specific cpus cannot be moved to a new cpuset; we - * cannot change their cpu affinity and isolating such threads by their - * set of allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may - * be changed. - */ - if (cgroup_taskset_first(tset)->flags & PF_THREAD_BOUND) - return -EINVAL; - + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * Kthreads bound to specific cpus cannot be moved to a new + * cpuset; we cannot change their cpu affinity and + * isolating such threads by their set of allowed nodes is + * unnecessary. Thus, cpusets are not applicable for such + * threads. This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before + * cpus_allowed may be changed. + */ + if (task->flags & PF_THREAD_BOUND) + return -EINVAL; + if ((ret = security_task_setscheduler(task))) + return ret; + } return 0; } -static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) -{ - return security_task_setscheduler(task); -} - /* * Protected by cgroup_lock. The nodemasks must be stored globally because * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, attach_task, and attach. + * persist among pre_attach, and attach. */ static cpumask_var_t cpus_attach; static nodemask_t cpuset_attach_nodemask_from; @@ -1420,39 +1421,34 @@ static void cpuset_pre_attach(struct cgroup *cont) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); } -/* Per-thread attachment work. */ -static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) -{ - int err; - struct cpuset *cs = cgroup_cs(cont); - - /* - * can_attach beforehand should guarantee that this doesn't fail. - * TODO: have a better way to handle failure here - */ - err = set_cpus_allowed_ptr(tsk, cpus_attach); - WARN_ON_ONCE(err); - - cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, tsk); -} - static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; - struct task_struct *tsk = cgroup_taskset_first(tset); + struct task_struct *task; + struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + cgroup_taskset_for_each(task, cgrp, tset) { + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flag(cs, task); + } + /* * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; - mm = get_task_mm(tsk); + mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) @@ -1908,9 +1904,7 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, - .can_attach_task = cpuset_can_attach_task, .pre_attach = cpuset_pre_attach, - .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f95..3b8e0edbe693 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7044,10 +7044,13 @@ static int __perf_cgroup_move(void *info) return 0; } -static void -perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - task_function_call(task, __perf_cgroup_move, task); + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + task_function_call(task, __perf_cgroup_move, task); } static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, @@ -7061,7 +7064,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_attach_task(cgrp, task); + task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_subsys = { @@ -7070,6 +7073,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach_task = perf_cgroup_attach_task, + .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/sched.c b/kernel/sched.c index 0e9344a71be3..161184da7b81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9127,24 +9127,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) sched_destroy_group(tg); } -static int -cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) - return -EINVAL; + if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + return -EINVAL; #else - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; + /* We don't support RT-tasks being in separate groups */ + if (task->sched_class != &fair_sched_class) + return -EINVAL; #endif + } return 0; } -static void -cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup_taskset *tset) { - sched_move_task(tsk); + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + sched_move_task(task); } static void @@ -9480,8 +9487,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach_task = cpu_cgroup_can_attach_task, - .attach_task = cpu_cgroup_attach_task, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, -- cgit v1.2.3 From 94196f51c1ee5bbad674de28c682b17d78adb8e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:22 -0800 Subject: cgroup, cpuset: don't use ss->pre_attach() ->pre_attach() is supposed to be called before migration, which is observed during process migration but task migration does it the other way around. The only ->pre_attach() user is cpuset which can do the same operaitons in ->can_attach(). Collapse cpuset_pre_attach() into cpuset_can_attach(). -v2: Patch contamination from later patch removed. Spotted by Paul Menage. Signed-off-by: Tejun Heo Reviewed-by: Frederic Weisbecker Acked-by: Paul Menage Cc: Li Zefan --- kernel/cpuset.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a8a61301524..42e568306382 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1370,6 +1370,15 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in can_attach, and they must + * persist until attach. + */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup_taskset *tset) @@ -1396,29 +1405,16 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if ((ret = security_task_setscheduler(task))) return ret; } - return 0; -} - -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, and attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Set-up work for before attaching each task. */ -static void cpuset_pre_attach(struct cgroup *cont) -{ - struct cpuset *cs = cgroup_cs(cont); + /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + + return 0; } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, @@ -1904,7 +1900,6 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, - .pre_attach = cpuset_pre_attach, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, -- cgit v1.2.3 From 494c167cf76d02000adf740c215adc69a824ecc9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 Dec 2011 18:12:22 -0800 Subject: cgroup: kill subsys->can_attach_task(), pre_attach() and attach_task() These three methods are no longer used. Kill them. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Frederic Weisbecker Acked-by: Paul Menage Cc: Li Zefan --- kernel/cgroup.c | 52 +++++----------------------------------------------- 1 file changed, 5 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 41ee01e392e6..1b3b84174ead 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1944,13 +1944,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } - if (ss->can_attach_task) { - retval = ss->can_attach_task(cgrp, tsk); - if (retval) { - failed_ss = ss; - goto out; - } - } } retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); @@ -1958,10 +1951,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; for_each_subsys(root, ss) { - if (ss->pre_attach) - ss->pre_attach(cgrp); - if (ss->attach_task) - ss->attach_task(cgrp, tsk); if (ss->attach) ss->attach(ss, cgrp, &tset); } @@ -2093,7 +2082,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { int retval, i, group_size, nr_migrating_tasks; struct cgroup_subsys *ss, *failed_ss = NULL; - bool cancel_failed_ss = false; /* guaranteed to be initialized later, but the compiler needs this */ struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; @@ -2188,21 +2176,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) goto out_cancel_attach; } } - /* a callback to be run on every thread in the threadgroup. */ - if (ss->can_attach_task) { - /* run on each task in the threadgroup. */ - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - if (tc->cgrp == cgrp) - continue; - retval = ss->can_attach_task(cgrp, tc->task); - if (retval) { - failed_ss = ss; - cancel_failed_ss = true; - goto out_cancel_attach; - } - } - } } /* @@ -2234,15 +2207,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } /* - * step 3: now that we're guaranteed success wrt the css_sets, proceed - * to move all tasks to the new cgroup, calling ss->attach_task for each - * one along the way. there are no failure cases after here, so this is - * the commit point. + * step 3: now that we're guaranteed success wrt the css_sets, + * proceed to move all tasks to the new cgroup. There are no + * failure cases after here, so this is the commit point. */ - for_each_subsys(root, ss) { - if (ss->pre_attach) - ss->pre_attach(cgrp); - } for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); /* leave current thread as it is if it's already there */ @@ -2250,18 +2218,11 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) continue; retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); BUG_ON(retval); - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tc->task); - } } /* nothing is sensitive to fork() after this point. */ /* - * step 4: do expensive, non-thread-specific subsystem callbacks. - * TODO: if ever a subsystem needs to know the oldcgrp for each task - * being moved, this call will need to be reworked to communicate that. + * step 4: do subsystem attach callbacks. */ for_each_subsys(root, ss) { if (ss->attach) @@ -2285,11 +2246,8 @@ out_cancel_attach: /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { - if (ss == failed_ss) { - if (cancel_failed_ss && ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + if (ss == failed_ss) break; - } if (ss->cancel_attach) ss->cancel_attach(ss, cgrp, &tset); } -- cgit v1.2.3 From 52dcf8a1f8ac09b6ea21266ebdc4db6d52eea1fc Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 5 Dec 2011 22:13:41 +0100 Subject: resource cgroups: remove bogus cast The memparse() function already accepts const char * as the parsing string. Signed-off-by: Davidlohr Bueso Acked-by: Pavel Emelyanov Signed-off-by: Tejun Heo --- kernel/res_counter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 34683efa2cce..6d269cce7aa1 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf, return 0; } - /* FIXME - make memparse() take const char* args */ - *res = memparse((char *)buf, &end); + *res = memparse(buf, &end); if (*end != '\0') return -EINVAL; -- cgit v1.2.3 From 6e736be7f282fff705db7c34a15313281b372a76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:38 +0100 Subject: block: make ioc get/put interface more conventional and fix race on alloction Ignoring copy_io() during fork, io_context can be allocated from two places - current_io_context() and set_task_ioprio(). The former is always called from local task while the latter can be called from different task. The synchornization between them are peculiar and dubious. * current_io_context() doesn't grab task_lock() and assumes that if it saw %NULL ->io_context, it would stay that way until allocation and assignment is complete. It has smp_wmb() between alloc/init and assignment. * set_task_ioprio() grabs task_lock() for assignment and does smp_read_barrier_depends() between "ioc = task->io_context" and "if (ioc)". Unfortunately, this doesn't achieve anything - the latter is not a dependent load of the former. ie, if ioc itself were being dereferenced "ioc->xxx", it would mean something (not sure what tho) but as the code currently stands, the dependent read barrier is noop. As only one of the the two test-assignment sequences is task_lock() protected, the task_lock() can't do much about race between the two. Nothing prevents current_io_context() and set_task_ioprio() allocating its own ioc for the same task and overwriting the other's. Also, set_task_ioprio() can race with exiting task and create a new ioc after exit_io_context() is finished. ioc get/put doesn't have any reason to be complex. The only hot path is accessing the existing ioc of %current, which is simple to achieve given that ->io_context is never destroyed as long as the task is alive. All other paths can happily go through task_lock() like all other task sub structures without impacting anything. This patch updates ioc get/put so that it becomes more conventional. * alloc_io_context() is replaced with get_task_io_context(). This is the only interface which can acquire access to ioc of another task. On return, the caller has an explicit reference to the object which should be put using put_io_context() afterwards. * The functionality of current_io_context() remains the same but when creating a new ioc, it shares the code path with get_task_io_context() and always goes through task_lock(). * get_io_context() now means incrementing ref on an ioc which the caller already has access to (be that an explicit refcnt or implicit %current one). * PF_EXITING inhibits creation of new io_context and once exit_io_context() is finished, it's guaranteed that both ioc acquisition functions return %NULL. * All users are updated. Most are trivial but smp_read_barrier_depends() removal from cfq_get_io_context() needs a bit of explanation. I suppose the original intention was to ensure ioc->ioprio is visible when set_task_ioprio() allocates new io_context and installs it; however, this wouldn't have worked because set_task_ioprio() doesn't have wmb between init and install. There are other problems with this which will be fixed in another patch. * While at it, use NUMA_NO_NODE instead of -1 for wildcard node specification. -v2: Vivek spotted contamination from debug patch. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- kernel/fork.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d088..5bcfc739bb7c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -870,6 +870,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; + struct io_context *new_ioc; if (!ioc) return 0; @@ -881,11 +882,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; } else if (ioprio_valid(ioc->ioprio)) { - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); - if (unlikely(!tsk->io_context)) + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) return -ENOMEM; - tsk->io_context->ioprio = ioc->ioprio; + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3 From b2efa05265d62bc29f3a64400fad4b44340eedb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Dec 2011 00:33:39 +0100 Subject: block, cfq: unlink cfq_io_context's immediately cic is association between io_context and request_queue. A cic is linked from both ioc and q and should be destroyed when either one goes away. As ioc and q both have their own locks, locking becomes a bit complex - both orders work for removal from one but not from the other. Currently, cfq tries to circumvent this locking order issue with RCU. ioc->lock nests inside queue_lock but the radix tree and cic's are also protected by RCU allowing either side to walk their lists without grabbing lock. This rather unconventional use of RCU quickly devolves into extremely fragile convolution. e.g. The following is from cfqd going away too soon after ioc and q exits raced. general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: [ 88.503444] Pid: 599, comm: hexdump Not tainted 3.1.0-rc10-work+ #158 Bochs Bochs RIP: 0010:[] [] cfq_exit_single_io_context+0x58/0xf0 ... Call Trace: [] call_for_each_cic+0x5a/0x90 [] cfq_exit_io_context+0x15/0x20 [] exit_io_context+0x100/0x140 [] do_exit+0x579/0x850 [] do_group_exit+0x5b/0xd0 [] sys_exit_group+0x17/0x20 [] system_call_fastpath+0x16/0x1b The only real hot path here is cic lookup during request initialization and avoiding extra locking requires very confined use of RCU. This patch makes cic removal from both ioc and request_queue perform double-locking and unlink immediately. * From q side, the change is almost trivial as ioc->lock nests inside queue_lock. It just needs to grab each ioc->lock as it walks cic_list and unlink it. * From ioc side, it's a bit more difficult because of inversed lock order. ioc needs its lock to walk its cic_list but can't grab the matching queue_lock and needs to perform unlock-relock dancing. Unlinking is now wholly done from put_io_context() and fast path is optimized by using the queue_lock the caller already holds, which is by far the most common case. If the ioc accessed multiple devices, it tries with trylock. In unlikely cases of fast path failure, it falls back to full double-locking dance from workqueue. Double-locking isn't the prettiest thing in the world but it's *far* simpler and more understandable than RCU trick without adding any meaningful overhead. This still leaves a lot of now unnecessary RCU logics. Future patches will trim them. -v2: Vivek pointed out that cic->q was being dereferenced after cic->release() was called. Updated to use local variable @this_q instead. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5bcfc739bb7c..2753449f2038 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -887,7 +887,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); + put_io_context(new_ioc, NULL); } #endif return 0; -- cgit v1.2.3 From 997d3eaf02cad6fdb54bd6085c9a7c48ddd68a2d Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 14 Dec 2011 14:54:22 -0800 Subject: rtmutex-tester: convert sysdev_class to a regular subsystem After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Cc: Thomas Gleixner Cc: Paul Gortmaker Cc: Arnd Bergmann Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/rtmutex-tester.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 3d9f31cd79e7..98ec49475460 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,11 +6,11 @@ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner * */ +#include #include #include #include #include -#include #include #include @@ -27,7 +27,7 @@ struct test_thread_data { int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; int event; - struct sys_device sysdev; + struct device dev; }; static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; @@ -271,7 +271,7 @@ static int test_func(void *data) * * opcode:data */ -static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct sched_param schedpar; @@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut char cmdbuf[32]; int op, dat, tid, ret; - td = container_of(dev, struct test_thread_data, sysdev); - tid = td->sysdev.id; + td = container_of(dev, struct test_thread_data, dev); + tid = td->dev.id; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(cmdbuf)) @@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut * @dev: thread to query * @buf: char buffer to be filled with thread status info */ -static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, char *buf) { struct test_thread_data *td; @@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute char *curr = buf; int i; - td = container_of(dev, struct test_thread_data, sysdev); - tsk = threads[td->sysdev.id]; + td = container_of(dev, struct test_thread_data, dev); + tsk = threads[td->dev.id]; spin_lock(&rttest_lock); @@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_unlock(&rttest_lock); curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->sysdev.id].owner); + mutexes[td->dev.id].owner); return curr - buf; } -static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); -static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); +static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); -static struct sysdev_class rttest_sysclass = { +static struct bus_type rttest_subsys = { .name = "rttest", + .dev_name = "rttest", }; static int init_test_thread(int id) { - thread_data[id].sysdev.cls = &rttest_sysclass; - thread_data[id].sysdev.id = id; + thread_data[id].dev.bus = &rttest_subsys; + thread_data[id].dev.id = id; threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); if (IS_ERR(threads[id])) return PTR_ERR(threads[id]); - return sysdev_register(&thread_data[id].sysdev); + return device_register(&thread_data[id].dev); } static int init_rttest(void) @@ -393,7 +394,7 @@ static int init_rttest(void) for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) rt_mutex_init(&mutexes[i]); - ret = sysdev_class_register(&rttest_sysclass); + ret = subsys_system_register(&rttest_subsys, NULL); if (ret) return ret; @@ -401,10 +402,10 @@ static int init_rttest(void) ret = init_test_thread(i); if (ret) break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + ret = device_create_file(&thread_data[i].dev, &dev_attr_status); if (ret) break; - ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + ret = device_create_file(&thread_data[i].dev, &dev_attr_command); if (ret) break; } -- cgit v1.2.3 From d369a5d8fc70710236ae2d06a0e42dce483712df Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 14 Dec 2011 15:28:51 -0800 Subject: clocksource: convert sysdev_class to a regular subsystem After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/time/clocksource.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e096..3f5c8512c033 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,8 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#include #include -#include #include #include #include /* for spin_unlock_irq() using preempt_count() m68k */ @@ -754,8 +754,8 @@ EXPORT_SYMBOL(clocksource_unregister); * Provides sysfs interface for listing current clocksource. */ static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +sysfs_show_current_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { ssize_t count = 0; @@ -775,8 +775,8 @@ sysfs_show_current_clocksources(struct sys_device *dev, * Takes input from sysfs interface for manually overriding the default * clocksource selection. */ -static ssize_t sysfs_override_clocksource(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sysfs_override_clocksource(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { size_t ret = count; @@ -809,8 +809,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, * Provides sysfs interface for listing registered clocksources */ static ssize_t -sysfs_show_available_clocksources(struct sys_device *dev, - struct sysdev_attribute *attr, +sysfs_show_available_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) { struct clocksource *src; @@ -839,35 +839,36 @@ sysfs_show_available_clocksources(struct sys_device *dev, /* * Sysfs setup bits: */ -static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, +static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0444, +static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); -static struct sysdev_class clocksource_sysclass = { +static struct bus_type clocksource_subsys = { .name = "clocksource", + .dev_name = "clocksource", }; -static struct sys_device device_clocksource = { +static struct device device_clocksource = { .id = 0, - .cls = &clocksource_sysclass, + .bus = &clocksource_subsys, }; static int __init init_clocksource_sysfs(void) { - int error = sysdev_class_register(&clocksource_sysclass); + int error = subsys_system_register(&clocksource_subsys, NULL); if (!error) - error = sysdev_register(&device_clocksource); + error = device_register(&device_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_current_clocksource); + &dev_attr_current_clocksource); if (!error) - error = sysdev_create_file( + error = device_create_file( &device_clocksource, - &attr_available_clocksource); + &dev_attr_available_clocksource); return error; } -- cgit v1.2.3 From 54848d73f9f254631303d6eab9b976855988b266 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 5 Apr 2011 13:21:19 -0600 Subject: writeback: charge leaked page dirties to active tasks It's a years long problem that a large number of short-lived dirtiers (eg. gcc instances in a fast kernel build) may starve long-run dirtiers (eg. dd) as well as pushing the dirty pages to the global hard limit. The solution is to charge the pages dirtied by the exited gcc to the other random dirtying tasks. It sounds not perfect, however should behave good enough in practice, seeing as that throttled tasks aren't actually running so those that are running are more likely to pick it up and get throttled, therefore promoting an equal spread. Randy: fix compile error: 'dirty_throttle_leaks' undeclared in exit.c Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Randy Dunlap Signed-off-by: Wu Fengguang --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..d4aac24cc469 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1037,6 +1038,8 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; -- cgit v1.2.3 From 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 19:25:42 -0600 Subject: writeback: dirty ratelimit - think time compensation Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be executed accurately. think time := time spend outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (result in negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hitted. Pseudo code: period = pages_dirtied / task_ratelimit; think = jiffies - dirty_paused_when; pause = period - think; 1) normal case: period > think pause = period - think dirty_paused_when = jiffies + pause nr_dirtied = 0 period time |===============================>| think time pause time |===============>|==============>| ------|----------------|---------------|------------------------ dirty_paused_when jiffies 2) no pause case: period <= think don't pause; reduce future pause time by: dirty_paused_when += period nr_dirtied = 0 period time |===============================>| think time |===================================================>| ------|--------------------------------+-------------------|---- dirty_paused_when jiffies Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d088..f8668cf6a32d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system. -- cgit v1.2.3 From 29e21368b9baf9c4b25060d65062da2dda926c70 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Dec 2011 14:21:26 -0800 Subject: cgroups: remove redundant get/put of css_set from css_set_check_fetched() We already have a reference to all elements in newcg_list. Signed-off-by: Mandeep Singh Baines Reviewed-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: Paul Menage --- kernel/cgroup.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1b3b84174ead..bc3caff138d8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2025,23 +2025,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp, read_lock(&css_set_lock); newcg = find_existing_css_set(cg, cgrp, template); - if (newcg) - get_css_set(newcg); read_unlock(&css_set_lock); /* doesn't exist at all? */ if (!newcg) return false; /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) { - if (cg_entry->cg == newcg) { - put_css_set(newcg); + list_for_each_entry(cg_entry, newcg_list, links) + if (cg_entry->cg == newcg) return true; - } - } /* not found */ - put_css_set(newcg); return false; } -- cgit v1.2.3 From 30fb6aa74011dcf595f306ca2727254d708b786e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 5 Dec 2011 18:22:48 +0100 Subject: ftrace: Fix unregister ftrace_ops accounting Multiple users of the function tracer can register their functions with the ftrace_ops structure. The accounting within ftrace will update the counter on each function record that is being traced. When the ftrace_ops filtering adds or removes functions, the function records will be updated accordingly if the ftrace_ops is still registered. When a ftrace_ops is removed, the counter of the function records, that the ftrace_ops traces, are decremented. When they reach zero the functions that they represent are modified to stop calling the mcount code. When changes are made, the code is updated via stop_machine() with a command passed to the function to tell it what to do. There is an ENABLE and DISABLE command that tells the called function to enable or disable the functions. But the ENABLE is really a misnomer as it should just update the records, as records that have been enabled and now have a count of zero should be disabled. The DISABLE command is used to disable all functions regardless of their counter values. This is the big off switch and is not the complement of the ENABLE command. To make matters worse, when a ftrace_ops is unregistered and there is another ftrace_ops registered, neither the DISABLE nor the ENABLE command are set when calling into the stop_machine() function and the records will not be updated to match their counter. A command is passed to that function that will update the mcount code to call the registered callback directly if it is the only one left. This means that the ftrace_ops that is still registered will have its callback called by all functions that have been set for it as well as the ftrace_ops that was just unregistered. Here's a way to trigger this bug. Compile the kernel with CONFIG_FUNCTION_PROFILER set and with CONFIG_FUNCTION_GRAPH not set: CONFIG_FUNCTION_PROFILER=y # CONFIG_FUNCTION_GRAPH is not set This will force the function profiler to use the function tracer instead of the function graph tracer. # cd /sys/kernel/debug/tracing # echo schedule > set_ftrace_filter # echo function > current_tracer # cat set_ftrace_filter schedule # cat trace # tracer: nop # # entries-in-buffer/entries-written: 692/68108025 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | kworker/0:2-909 [000] .... 531.235574: schedule <-worker_thread -0 [001] .N.. 531.235575: schedule <-cpu_idle kworker/0:2-909 [000] .... 531.235597: schedule <-worker_thread sshd-2563 [001] .... 531.235647: schedule <-schedule_hrtimeout_range_clock # echo 1 > function_profile_enabled # echo 0 > function_porfile_enabled # cat set_ftrace_filter schedule # cat trace # tracer: function # # entries-in-buffer/entries-written: 159701/118821262 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | -0 [002] ...1 604.870655: local_touch_nmi <-cpu_idle -0 [002] d..1 604.870655: enter_idle <-cpu_idle -0 [002] d..1 604.870656: atomic_notifier_call_chain <-enter_idle -0 [002] d..1 604.870656: __atomic_notifier_call_chain <-atomic_notifier_call_chain The same problem could have happened with the trace_probe_ops, but they are modified with the set_frace_filter file which does the update at closure of the file. The simple solution is to change ENABLE to UPDATE and call it every time an ftrace_ops is unregistered. Link: http://lkml.kernel.org/r/1323105776-26961-3-git-send-email-jolsa@redhat.com Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1e8943fed1d..25b4f4da0fe8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -948,7 +948,7 @@ struct ftrace_func_probe { }; enum { - FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_START_FUNC_RET = (1 << 3), @@ -1519,7 +1519,7 @@ int ftrace_text_reserved(void *start, void *end) static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int update) { unsigned long ftrace_addr; unsigned long flag = 0UL; @@ -1527,17 +1527,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If we are enabling tracing: + * If we are updating calls: * * If the record has a ref count, then we need to enable it * because someone is using it. * * Otherwise we make sure its disabled. * - * If we are disabling tracing, then disable all records that + * If we are disabling calls, then disable all records that * are enabled. */ - if (enable && (rec->flags & ~FTRACE_FL_MASK)) + if (update && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ @@ -1553,7 +1553,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_nop(NULL, rec, ftrace_addr); } -static void ftrace_replace_code(int enable) +static void ftrace_replace_code(int update) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1567,7 +1567,7 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FREE) continue; - failed = __ftrace_replace_code(rec, enable); + failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1623,7 +1623,7 @@ static int __ftrace_modify_code(void *data) */ function_trace_stop++; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); @@ -1691,7 +1691,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return -ENODEV; ftrace_start_up++; - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; /* ops marked global share the filter hashes */ if (ops->flags & FTRACE_OPS_FL_GLOBAL) { @@ -1743,8 +1743,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - if (!ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; + command |= FTRACE_UPDATE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -1766,7 +1765,7 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); } static void ftrace_shutdown_sysctl(void) @@ -2919,7 +2918,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); @@ -3107,7 +3106,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash, iter->hash); if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + ftrace_run_update_code(FTRACE_UPDATE_CALLS); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From c88fd8634ea68e74c7d19fd2621b4078fd22864c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Aug 2011 09:53:39 -0400 Subject: ftrace: Allow archs to modify code without stop machine The stop machine method to modify all functions in the kernel (some 20,000 of them) is the safest way to do so across all archs. But some archs may not need this big hammer approach to modify code on SMP machines, and can simply just update the code it needs. Adding a weak function arch_ftrace_update_code() that now does the stop machine, will also let any arch override this method. If the arch needs to check the system and then decide if it can avoid stop machine, it can still call ftrace_run_stop_machine() to use the old method. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 253 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 215 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25b4f4da0fe8..655b432fb890 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -947,13 +947,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; -enum { - FTRACE_UPDATE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_START_FUNC_RET = (1 << 3), - FTRACE_STOP_FUNC_RET = (1 << 4), -}; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; @@ -1307,6 +1300,28 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns 1 if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_location(unsigned long ip) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip == ip) + return 1; + } while_for_each_ftrace_rec(); + + return 0; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1475,7 +1490,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -1517,15 +1544,10 @@ int ftrace_text_reserved(void *start, void *end) return 0; } - -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { - unsigned long ftrace_addr; unsigned long flag = 0UL; - ftrace_addr = (unsigned long)FTRACE_ADDR; - /* * If we are updating calls: * @@ -1537,20 +1559,74 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int update) * If we are disabling calls, then disable all records that * are enabled. */ - if (update && (rec->flags & ~FTRACE_FL_MASK)) + if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) - return 0; + return FTRACE_UPDATE_IGNORE; if (flag) { - rec->flags |= FTRACE_FL_ENABLED; + if (update) + rec->flags |= FTRACE_FL_ENABLED; + return FTRACE_UPDATE_MAKE_CALL; + } + + if (update) + rec->flags &= ~FTRACE_FL_ENABLED; + + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 1); +} + +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. + */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ + return ftrace_check_record(rec, enable, 0); +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); } - rec->flags &= ~FTRACE_FL_ENABLED; - return ftrace_make_nop(NULL, rec, ftrace_addr); + return -1; /* unknow ftrace bug */ } static void ftrace_replace_code(int update) @@ -1576,6 +1652,78 @@ static void ftrace_replace_code(int update) } while_for_each_ftrace_rec(); } +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { @@ -1617,12 +1765,6 @@ static int __ftrace_modify_code(void *data) { int *command = data; - /* - * Do not call function tracer while we update the code. - * We are in stop machine, no worrying about races. - */ - function_trace_stop++; - if (*command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1636,21 +1778,33 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif - function_trace_stop--; - return 0; } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + static void ftrace_run_update_code(int command) { int ret; @@ -1659,8 +1813,31 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); if (ret) return; + /* + * Do not call function tracer while we update the code. + * We are in stop machine. + */ + function_trace_stop++; - stop_machine(__ftrace_modify_code, &command, NULL); + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); -- cgit v1.2.3 From 3208230983a0ee3d95be22d463257e530c684956 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 14:42:37 -0500 Subject: ftrace: Remove usage of "freed" records Records that are added to the function trace table are permanently there, except for modules. By separating out the modules to their own pages that can be freed in one shot we can remove the "freed" flag and simplify some of the record management. Another benefit of doing this is that we can also move the records around; sort them. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 100 +++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 655b432fb890..be6888f40d2b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -996,8 +996,6 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; - static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1421,32 +1419,8 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static void ftrace_free_rec(struct dyn_ftrace *rec) -{ - rec->freelist = ftrace_free_records; - ftrace_free_records = rec; - rec->flags |= FTRACE_FL_FREE; -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - struct dyn_ftrace *rec; - - /* First check for freed records */ - if (ftrace_free_records) { - rec = ftrace_free_records; - - if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { - FTRACE_WARN_ON_ONCE(1); - ftrace_free_records = NULL; - return NULL; - } - - ftrace_free_records = rec->freelist; - memset(rec, 0, sizeof(*rec)); - return rec; - } - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) { /* allocate another page */ @@ -1639,10 +1613,6 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - /* Skip over free records */ - if (rec->flags & FTRACE_FL_FREE) - continue; - failed = __ftrace_replace_code(rec, update); if (failed) { ftrace_bug(failed, rec->ip); @@ -2007,11 +1977,8 @@ static int ftrace_update_code(struct module *mod) * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!ftrace_code_disable(mod, p)) { - ftrace_free_rec(p); - /* Game over */ + if (!ftrace_code_disable(mod, p)) break; - } ftrace_update_cnt++; @@ -2026,10 +1993,8 @@ static int ftrace_update_code(struct module *mod) */ if (ftrace_start_up && ref) { int failed = __ftrace_replace_code(p, 1); - if (failed) { + if (failed) ftrace_bug(failed, p->ip); - ftrace_free_rec(p); - } } } @@ -2223,9 +2188,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FREE) || - - ((iter->flags & FTRACE_ITER_FILTER) && + if (((iter->flags & FTRACE_ITER_FILTER) && !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && @@ -2602,7 +2565,6 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { ret = enter_record(hash, rec, not); if (ret < 0) { @@ -3446,9 +3408,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FREE) - continue; - if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; @@ -3566,6 +3525,27 @@ static int ftrace_process_locs(struct module *mod, unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (mod) { + if (!ftrace_pages) + return -ENOMEM; + + /* + * If the last page was full, it will be + * allocated anyway. + */ + if (ftrace_pages->index != ENTRIES_PER_PAGE) { + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return -ENOMEM; + ftrace_pages = ftrace_pages->next; + } + } + p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -3599,9 +3579,13 @@ static int ftrace_process_locs(struct module *mod, } #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; + struct ftrace_page **last_pg; struct ftrace_page *pg; mutex_lock(&ftrace_lock); @@ -3609,16 +3593,30 @@ void ftrace_release_mod(struct module *mod) if (ftrace_disabled) goto out_unlock; - do_for_each_ftrace_rec(pg, rec) { + /* + * Each module has its own ftrace_pages, remove + * them from the list. + */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; if (within_module_core(rec->ip, mod)) { /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. + * As core pages are first, the first + * page should never be a module page. */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + *last_pg = pg->next; + free_page((unsigned long)pg); + } else + last_pg = &pg->next; + } out_unlock: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From a79008755497daff157f5294c02e3b940641cc11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:23:44 -0500 Subject: ftrace: Allocate the mcount record pages as groups Allocate the mcount record pages as a group of pages as big as can be allocated and waste no more than a single page. Grouping the mcount pages as much as possible helps with cache locality, as we do not need to redirect with descriptors as we cross from page to page. It also allows us to do more with the records later on (sort them with bigger benefits). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 179 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index be6888f40d2b..2e7218869fe9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -983,12 +983,13 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; + struct dyn_ftrace *records; int index; - struct dyn_ftrace records[]; + int size; }; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) /* estimate from running different kernels */ #define NR_TO_INIT 10000 @@ -1421,14 +1422,10 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) { - /* allocate another page */ - ftrace_pages->next = - (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return NULL; - } + if (ftrace_pages->index == ftrace_pages->size) { + /* We should have allocated enough */ + if (WARN_ON(!ftrace_pages->next)) + return NULL; ftrace_pages = ftrace_pages->next; } @@ -2005,47 +2002,106 @@ static int ftrace_update_code(struct module *mod) return 0; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count) { - struct ftrace_page *pg; + int order; int cnt; - int i; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; + if (WARN_ON(!count)) + return -EINVAL; + + order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. + * We want to fill as much as possible. No more than a page + * may be empty. */ + while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + order--; - pg = ftrace_pages = ftrace_pages_start; + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order >>= 1; + goto again; + } - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->size = cnt; - /* If we fail, we'll try later anyway */ - if (!pg->next) + if (cnt > count) + cnt = count; + + return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int order; + int cnt; + + if (!num_to_init) + return 0; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) break; + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + pg = pg->next; } + return start_pg; + + free_pages: + while (start_pg) { + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + start_pg = pg->next; + kfree(pg); + pg = start_pg; + } + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; +} + +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + int cnt; + + if (!num_to_init) { + pr_info("ftrace: No functions to be traced?\n"); + return -1; + } + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + return 0; } @@ -3520,30 +3576,45 @@ static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *pg; + unsigned long count; unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + pg = ftrace_allocate_pages(count); + if (!pg) + return -ENOMEM; mutex_lock(&ftrace_lock); + /* * Core and each module needs their own pages, as * modules will free them when they are removed. * Force a new page to be allocated for modules. */ - if (mod) { + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = pg; + } else { if (!ftrace_pages) - return -ENOMEM; + goto out; - /* - * If the last page was full, it will be - * allocated anyway. - */ - if (ftrace_pages->index != ENTRIES_PER_PAGE) { - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages->next) - return -ENOMEM; - ftrace_pages = ftrace_pages->next; + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? */ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; } + + ftrace_pages->next = pg; + ftrace_pages = pg; } p = start; @@ -3557,7 +3628,8 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - ftrace_record_ip(addr); + if (!ftrace_record_ip(addr)) + break; } /* @@ -3573,9 +3645,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_update_code(mod); if (!mod) local_irq_restore(flags); + ret = 0; + out: mutex_unlock(&ftrace_lock); - return 0; + return ret; } #ifdef CONFIG_MODULES @@ -3587,6 +3661,7 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page **last_pg; struct ftrace_page *pg; + int order; mutex_lock(&ftrace_lock); @@ -3613,7 +3688,9 @@ void ftrace_release_mod(struct module *mod) ftrace_pages = next_to_ftrace_page(last_pg); *last_pg = pg->next; - free_page((unsigned long)pg); + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); } else last_pg = &pg->next; } -- cgit v1.2.3 From 85ae32ae019bc1c2cc22e5f51fe0c9f2812ef68c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 16:30:31 -0500 Subject: ftrace: Replace record newlist with record page list As new functions come in to be initalized from mcount to nop, they are done by groups of pages. Whether it is the core kernel or a module. There's no need to keep track of these on a per record basis. At startup, and as any module is loaded, the functions to be traced are stored in a group of pages and added to the function list at the end. We just need to keep a pointer to the first page of the list that was added, and use that to know where to start on the list for initializing functions. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e7218869fe9..366d7881f188 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -977,8 +977,6 @@ static struct ftrace_ops global_ops = { .filter_hash = EMPTY_HASH, }; -static struct dyn_ftrace *ftrace_new_addrs; - static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { @@ -988,6 +986,8 @@ struct ftrace_page { int size; }; +static struct ftrace_page *ftrace_new_pgs; + #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -1445,8 +1445,6 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->newlist = ftrace_new_addrs; - ftrace_new_addrs = rec; return rec; } @@ -1936,9 +1934,11 @@ static int ops_traces_mod(struct ftrace_ops *ops) static int ftrace_update_code(struct module *mod) { + struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + int i; /* * When adding a module, we need to check if tracers are @@ -1960,41 +1960,44 @@ static int ftrace_update_code(struct module *mod) start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - while (ftrace_new_addrs) { + for (pg = ftrace_new_pgs; pg; pg = pg->next) { - /* If something went wrong, bail without enabling anything */ - if (unlikely(ftrace_disabled)) - return -1; + for (i = 0; i < pg->index; i++) { + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; - p = ftrace_new_addrs; - ftrace_new_addrs = p->newlist; - p->flags = ref; + p = &pg->records[i]; + p->flags = ref; - /* - * Do the initial record conversion from mcount jump - * to the NOP instructions. - */ - if (!ftrace_code_disable(mod, p)) - break; + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) + break; - ftrace_update_cnt++; + ftrace_update_cnt++; - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && ref) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p->ip); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) + ftrace_bug(failed, p->ip); + } } } + ftrace_new_pgs = NULL; + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; @@ -3632,6 +3635,9 @@ static int ftrace_process_locs(struct module *mod, break; } + /* These new locations need to be initialized */ + ftrace_new_pgs = pg; + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3 From 68950619f8c82e468d8976130462617abea605a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 17:06:45 -0500 Subject: ftrace: Sort the mcount records on each page Sort records by ip locations of the ftrace mcount calls on each of the set of pages in the function list. This helps in localizing cache usuage when updating the function locations, as well as gives us the ability to quickly find an ip location in the list. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 366d7881f188..2d6f8bcd1884 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -3575,6 +3576,29 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } +static void ftrace_swap_recs(void *a, void *b, int size) +{ + struct dyn_ftrace *reca = a; + struct dyn_ftrace *recb = b; + struct dyn_ftrace t; + + t = *reca; + *reca = *recb; + *recb = t; +} + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -3638,6 +3662,11 @@ static int ftrace_process_locs(struct module *mod, /* These new locations need to be initialized */ ftrace_new_pgs = pg; + /* Make each individual set of pages sorted by ips */ + for (; pg; pg = pg->next) + sort(pg->records, pg->index, sizeof(struct dyn_ftrace), + ftrace_cmp_recs, ftrace_swap_recs); + /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt -- cgit v1.2.3 From 5855fead9cc358adebd6bdeec202d040c623ae38 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 19:27:42 -0500 Subject: ftrace: Use bsearch to find record ip Now that each set of pages in the function list are sorted by ip, we can use bsearch to find a record within each set of pages. This speeds up the ftrace_location() function by magnitudes. For archs (like x86) that need to add a breakpoint at every function that will be converted from a nop to a callback and vice versa, the breakpoint callback needs to know if the breakpoint was for ftrace or not. It requires finding the breakpoint ip within the records. Doing a linear search is extremely inefficient. It is a must to be able to do a fast binary search to find these locations. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d6f8bcd1884..dcd3a814d39b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1300,6 +1301,19 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) } \ } + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *reca = a; + const struct dyn_ftrace *recb = b; + + if (reca->ip > recb->ip) + return 1; + if (reca->ip < recb->ip) + return -1; + return 0; +} + /** * ftrace_location - return true if the ip giving is a traced location * @ip: the instruction pointer to check @@ -1313,11 +1327,17 @@ int ftrace_location(unsigned long ip) { struct ftrace_page *pg; struct dyn_ftrace *rec; + struct dyn_ftrace key; - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip == ip) + key.ip = ip; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) return 1; - } while_for_each_ftrace_rec(); + } return 0; } @@ -3587,18 +3607,6 @@ static void ftrace_swap_recs(void *a, void *b, int size) *recb = t; } -static int ftrace_cmp_recs(const void *a, const void *b) -{ - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; - - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) - return -1; - return 0; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) -- cgit v1.2.3 From c842e975520f8ab09e293cc92f51a1f396251fd5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 18:44:44 -0500 Subject: ftrace: Fix ftrace hash record update with notrace When disabling the "notrace" records, that means we want to trace them. If the notrace_hash is zero, it means that we want to trace all records. But to disable a zero notrace_hash means nothing. The check for the notrace_hash count was incorrect with: if (hash && !hash->count) return With the correct comment above it that states that we do nothing if the notrace_hash has zero count. But !hash also means that the notrace hash has zero count. I think this was done to protect against dereferencing NULL. But if !hash is true, then we go through the following loop without doing a single thing. Fix it to: if (!hash || !hash->count) return; Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcd3a814d39b..a383d6c67bfa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1381,7 +1381,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (hash && !hash->count) + if (!hash || !hash->count) return; } -- cgit v1.2.3 From 06a51d9307380c78bb5c92e68fc80ad2c7d7f890 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 19:07:36 -0500 Subject: ftrace: Create ftrace_hash_empty() helper routine There are two types of hashes in the ftrace_ops; one type is the filter_hash and the other is the notrace_hash. Either one may be null, meaning it has no elements. But when elements are added, the hash is allocated. Throughout the code, a check needs to be made to see if a hash exists or the hash has elements, but the check if the hash exists is usually missing causing the possible "NULL pointer dereference bug". Add a helper routine called "ftrace_hash_empty()" that returns true if the hash doesn't exist or its count is zero. As they mean the same thing. Last-bug-reported-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a383d6c67bfa..e1ee07f81ca2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -999,6 +999,11 @@ static struct ftrace_page *ftrace_new_pgs; static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; +static bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + static struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1007,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) struct hlist_head *hhd; struct hlist_node *n; - if (!hash->count) + if (ftrace_hash_empty(hash)) return NULL; if (hash->size_bits > 0) @@ -1151,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; /* Empty hash? */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return new_hash; size = 1 << hash->size_bits; @@ -1276,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); - if ((!filter_hash || !filter_hash->count || + if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && - (!notrace_hash || !notrace_hash->count || + (ftrace_hash_empty(notrace_hash) || !ftrace_lookup_ip(notrace_hash, ip))) ret = 1; else @@ -1371,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; @@ -1381,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (!hash || !hash->count) + if (ftrace_hash_empty(hash)) return; } @@ -1398,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1407,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash && in_hash && !in_other_hash) match = 1; else if (!filter_hash && in_hash && - (in_other_hash || !other_hash->count)) + (in_other_hash || ftrace_hash_empty(other_hash))) match = 1; } if (!match) @@ -1950,7 +1955,7 @@ static int ops_traces_mod(struct ftrace_ops *ops) struct ftrace_hash *hash; hash = ops->filter_hash; - return !!(!hash || !hash->count); + return ftrace_hash_empty(hash); } static int ftrace_update_code(struct module *mod) @@ -2320,7 +2325,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (iter->flags & FTRACE_ITER_FILTER && + ftrace_hash_empty(ops->filter_hash)) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; -- cgit v1.2.3 From fc13cb0ce45296f331263a6034aa1814203e1ac3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:41:25 -0500 Subject: ftrace: Allow other users of function tracing to use the output listing The function tracer is set up to allow any other subsystem (like perf) to use it. Ftrace already has a way to list what functions are enabled by the global_ops. It would be very helpful to let other users of the function tracer to be able to use the same code. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e1ee07f81ca2..5b105c5ddc0c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2134,14 +2134,6 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) return 0; } -enum { - FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_HASH = (1 << 3), - FTRACE_ITER_ENABLED = (1 << 4), -}; - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2249,7 +2241,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -2305,7 +2297,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct ftrace_ops *ops = &global_ops; + struct ftrace_ops *ops = iter->ops; void *p = NULL; loff_t l; @@ -2414,6 +2406,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2442,6 +2435,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) iter->pg = ftrace_pages_start; iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -2462,7 +2456,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) mutex_unlock(&ftrace_lock); } -static int +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * ftrace_regex_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ +int ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { @@ -2542,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static loff_t +loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -3095,14 +3105,14 @@ out_unlock: return ret; } -static ssize_t +ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return ftrace_regex_write(file, ubuf, cnt, ppos, 1); } -static ssize_t +ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3292,8 +3302,7 @@ static void __init set_ftrace_early_filters(void) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } -static int -ftrace_regex_release(struct inode *inode, struct file *file) +int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; -- cgit v1.2.3 From 69a3083c4a7df0322d97bb2b43a33cb12af8131a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 15:21:16 -0500 Subject: ftrace: Decouple hash items from showing filtered functions The set_ftrace_filter shows "hashed" functions, which are functions that are added with operations to them (like traceon and traceoff). As other subsystems may be able to show what functions they are using for function tracing, the hash items should no longer be shown just because the FILTER flag is set. As they have nothing to do with other subsystems filters. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b105c5ddc0c..5728d9aa632e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2198,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; + if (!(iter->flags & FTRACE_ITER_DO_HASH)) + return NULL; + if (iter->func_pos > *pos) return NULL; @@ -2343,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p) { - if (iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); - - return NULL; - } + if (!p) + return t_hash_start(m, pos); return iter; } @@ -2541,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, - inode, file); + return ftrace_regex_open(&global_ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + inode, file); } static int -- cgit v1.2.3 From d2d45c7a03a2b1a14159cbb665e9dd60991a7d4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 14:44:09 -0500 Subject: tracing: Have stack_tracer use a separate list of functions The stack_tracer is used to look at every function and check if the current stack is bigger than the last recorded max stack size. When a new max is found, then it saves that stack off. Currently the stack tracer is limited by the global_ops of the function tracer. As the stack tracer has nothing to do with the ftrace function tracer, except that it uses it as its internal engine, the stack tracer should have its own list. A new file is added to the tracing debugfs directory called: stack_trace_filter that can be used to select which functions you want to check the stack on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d97..0398b7c7afd6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,7 +133,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t @@ -311,6 +310,21 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -358,6 +372,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); + trace_create_file("stack_trace_filter", 0444, d_tracer, + NULL, &stack_trace_filter_fops); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3 From 2a85a37f168d2b4d74d493b578af4dc9032be92e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 21:57:44 -0500 Subject: ftrace: Allow access to the boot time function enabling Change set_ftrace_early_filter() to ftrace_set_early_filter() and make it a global function. This will allow other subsystems in the kernel to be able to enable function tracing at start up and reuse the ftrace function parsing code. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5728d9aa632e..683d559a0eef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3279,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init -set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; @@ -3293,9 +3293,9 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); -- cgit v1.2.3 From 762e1207889b3451c50d365b741af6f9ce958886 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 19 Dec 2011 22:01:00 -0500 Subject: tracing: Have stack tracing set filtered functions at boot Add stacktrace_filter= to the kernel command line that lets the user pick specific functions to check the stack on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0398b7c7afd6..d4545f49242e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,9 @@ #include #include #include + +#include + #include "trace.h" #define STACK_TRACE_ENTRIES 500 @@ -352,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + static __init int enable_stacktrace(char *str) { + if (strncmp(str, "_filter=", 8) == 0) + strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); + stack_tracer_enabled = 1; last_stack_tracer_enabled = 1; return 1; @@ -375,6 +383,9 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace_filter", 0444, d_tracer, NULL, &stack_trace_filter_fops); + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3 From 38b78eb855409a05f9d370228bec1955e6878e08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Dec 2011 14:31:35 -0800 Subject: tracing: Factorize filter creation There are four places where new filter for a given filter string is created, which involves several different steps. This patch factors those steps into create_[system_]filter() functions which in turn make use of create_filter_{start|finish}() for common parts. The only functional change is that if replace_filter_string() is requested and fails, creation fails without any side effect instead of being ignored. Note that system filter is now installed after the processing is complete which makes freeing before and then restoring filter string on error unncessary. -v2: Rebased to resolve conflict with 49aa29513e and updated both create_filter() functions to always set *filterp instead of requiring the caller to clear it to %NULL on entry. Link: http://lkml.kernel.org/r/1323988305-1469-2-git-send-email-tj@kernel.org Signed-off-by: Tejun Heo Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 283 +++++++++++++++++++------------------ 1 file changed, 142 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f04cc3136bd3..24aee7127451 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system, return -ENOMEM; } +static int create_filter_start(char *filter_str, bool set_str, + struct filter_parse_state **psp, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_state *ps = NULL; + int err = 0; + + WARN_ON_ONCE(*psp || *filterp); + + /* allocate everything, and if any fails, free all and fail */ + filter = __alloc_filter(); + if (filter && set_str) + err = replace_filter_string(filter, filter_str); + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + + if (!filter || !ps || err) { + kfree(ps); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *psp = ps; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err && set_str) + append_filter_err(ps, filter); + return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ + if (ps) { + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + } +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, set_str, &ps, &filter); + if (!err) { + err = replace_preds(call, filter, ps, filter_str, false); + if (err && set_str) + append_filter_err(ps, filter); + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. + */ +static int create_system_filter(struct event_subsystem *system, + char *filter_str, struct event_filter **filterp) +{ + struct event_filter *filter = NULL; + struct filter_parse_state *ps = NULL; + int err; + + err = create_filter_start(filter_str, true, &ps, &filter); + if (!err) { + err = replace_system_preds(system, ps, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + } else { + append_filter_err(ps, filter); + } + } + create_filter_finish(ps); + + *filterp = filter; + return err; +} + int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; - struct event_filter *tmp; int err = 0; mutex_lock(&event_mutex); @@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) { - kfree(ps); - goto out_unlock; - } - - replace_filter_string(filter, filter_string); - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) { - append_filter_err(ps, filter); - goto out; - } + err = create_filter(call, filter_string, true, &filter); - err = replace_preds(call, filter, ps, filter_string, false); - if (err) { - filter_disable(call); - append_filter_err(ps, filter); - } else - call->flags |= TRACE_EVENT_FL_FILTERED; -out: /* * Always swap the call filter with the new filter * even if there was an error. If there was an error * in the filter, we disable the filter and show the error * string */ - tmp = call->filter; - rcu_assign_pointer(call->filter, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - synchronize_sched(); - __free_filter(tmp); + if (filter) { + struct event_filter *tmp = call->filter; + + if (!err) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + filter_disable(call); + + rcu_assign_pointer(call->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } } - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); out_unlock: mutex_unlock(&event_mutex); @@ -1811,7 +1902,6 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - struct filter_parse_state *ps; struct event_filter *filter; int err = 0; @@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; } - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto out_unlock; - - filter = __alloc_filter(); - if (!filter) - goto out; - - /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; - - /* - * No event actually uses the system filter - * we can free it without synchronize_sched(). - */ - __free_filter(system->filter); - system->filter = filter; - - parse_init(ps, filter_ops, filter_string); - err = filter_parse(ps); - if (err) - goto err_filter; - - err = replace_system_preds(system, ps, filter_string); - if (err) - goto err_filter; - -out: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); + err = create_system_filter(system, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; + } out_unlock: mutex_unlock(&event_mutex); return err; - -err_filter: - replace_filter_string(filter, filter_string); - append_filter_err(ps, system->filter); - goto out; } #ifdef CONFIG_PERF_EVENTS @@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, { int err; struct event_filter *filter; - struct filter_parse_state *ps; struct ftrace_event_call *call; mutex_lock(&event_mutex); @@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_filter(); - if (!filter) { - err = PTR_ERR(filter); - goto out_unlock; - } - - err = -ENOMEM; - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); + err = create_filter(call, filter_str, false, &filter); if (!err) event->filter = filter; - -free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - -free_filter: - if (err) + else __free_filter(filter); out_unlock: @@ -1954,43 +1991,6 @@ out_unlock: #define CREATE_TRACE_POINTS #include "trace_events_filter_test.h" -static int test_get_filter(char *filter_str, struct ftrace_event_call *call, - struct event_filter **pfilter) -{ - struct event_filter *filter; - struct filter_parse_state *ps; - int err = -ENOMEM; - - filter = __alloc_filter(); - if (!filter) - goto out; - - ps = kzalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - goto free_filter; - - parse_init(ps, filter_ops, filter_str); - err = filter_parse(ps); - if (err) - goto free_ps; - - err = replace_preds(call, filter, ps, filter_str, false); - if (!err) - *pfilter = filter; - - free_ps: - filter_opstack_clear(ps); - postfix_clear(ps); - kfree(ps); - - free_filter: - if (err) - __free_filter(filter); - - out: - return err; -} - #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ { \ .filter = FILTER, \ @@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = test_get_filter(d->filter, &event_ftrace_test_filter, - &filter); + err = create_filter(&event_ftrace_test_filter, d->filter, + false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", d->filter, err); + __free_filter(filter); break; } -- cgit v1.2.3 From 7e381b0eb1e1a9805c37335562e8dc02e7d7848c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Dec 2011 20:03:19 +0100 Subject: cgroup: Drop task_lock(parent) on cgroup_fork() We don't need to hold the parent task_lock() on the parent in cgroup_fork() because we are already synchronized against the two places that may change the parent css_set concurrently: - cgroup_exit(), but the parent obviously can't exit concurrently - cgroup migration: we are synchronized against threadgroup_lock() So we can safely remove the task_lock() there. Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Signed-off-by: Tejun Heo Cc: Containers Cc: Cgroups Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage Cc: Mandeep Singh Baines --- kernel/cgroup.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc3caff138d8..dae50d0d8e4b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4556,20 +4556,31 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. + * it was not made under the protection of RCU, cgroup_mutex or + * threadgroup_change_begin(), so it might no longer be a valid + * cgroup pointer. cgroup_attach_task() might have already changed + * current->cgroups, allowing the previously referenced cgroup + * group to be removed and freed. + * + * Outside the pointer validity we also need to process the css_set + * inheritance between threadgoup_change_begin() and + * threadgoup_change_end(), this way there is no leak in any process + * wide migration performed by cgroup_attach_proc() that could otherwise + * miss a thread because it is too early or too late in the fork stage. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); + /* + * We don't need to task_lock() current because current->cgroups + * can't be changed concurrently here. The parent obviously hasn't + * exited and called cgroup_exit(), and we are synchronized against + * cgroup migration through threadgroup_change_begin(). + */ child->cgroups = current->cgroups; get_css_set(child->cgroups); - task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } -- cgit v1.2.3 From c84cdf75ccb2845f690579e838f13f7e744e3d23 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Dec 2011 20:03:18 +0100 Subject: cgroup: Remove unnecessary task_lock before fetching css_set on migration When we fetch the css_set of the tasks on cgroup migration, we don't need anymore to synchronize against cgroup_exit() that could swap the old one with init_css_set. Now that we are using threadgroup_lock() during the migrations, we don't need to worry about it anymore. Signed-off-by: Frederic Weisbecker Reviewed-by: Mandeep Singh Baines Reviewed-by: Li Zefan Signed-off-by: Tejun Heo Cc: Containers Cc: Cgroups Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dae50d0d8e4b..4936d8886b4f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1850,14 +1850,14 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct css_set *newcg; /* - * get old css_set. we need to take task_lock and refcount it, because - * an exiting task can change its css_set to init_css_set and drop its - * old one without taking cgroup_mutex. + * get old css_set. We are synchronized through threadgroup_lock() + * against PF_EXITING setting such that we can't race against + * cgroup_exit() changing the css_set to init_css_set and dropping the + * old one. */ - task_lock(tsk); + WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; get_css_set(oldcg); - task_unlock(tsk); /* locate or allocate a new css_set for this task. */ if (guarantee) { @@ -1879,9 +1879,7 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, } put_css_set(oldcg); - /* @tsk can't exit as its threadgroup is locked */ task_lock(tsk); - WARN_ON_ONCE(tsk->flags & PF_EXITING); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -2182,11 +2180,13 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* nothing to do if this task is already in the cgroup */ if (tc->cgrp == cgrp) continue; - /* get old css_set pointer */ - task_lock(tc->task); + /* + * get old css_set pointer. threadgroup is locked so this is + * safe against concurrent cgroup_exit() changing this to + * init_css_set. + */ oldcg = tc->task->cgroups; get_css_set(oldcg); - task_unlock(tc->task); /* see if the new one for us is already in the list? */ if (css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) { /* was already there, nothing to do. */ -- cgit v1.2.3 From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 14:29:42 -0800 Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem and converts the devices to regular devices. The sysdev drivers are implemented as subsystem interfaces now. After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Userspace relies on events and generic sysfs subsystem infrastructure from sysdev devices, which are made available with this conversion. Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Tigran Aivazian Cc: Len Brown Cc: Zhang Rui Cc: Dave Jones Cc: Peter Zijlstra Cc: Russell King Cc: Andrew Morton Cc: Arjan van de Ven Cc: "Rafael J. Wysocki" Cc: "Srivatsa S. Bhat" Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/sched.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0e9344a71be3..530772646443 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7923,54 +7923,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *page) +static ssize_t sched_mc_power_savings_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - return sprintf(page, "%u\n", sched_mc_power_savings); + return sprintf(buf, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, - struct sysdev_class_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); +static DEVICE_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, - struct sysdev_class_attribute *attr, - char *page) +static ssize_t sched_smt_power_savings_show(struct device *dev, + struct device_attribute *attr, + char *buf) { - return sprintf(page, "%u\n", sched_smt_power_savings); + return sprintf(buf, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, - struct sysdev_class_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, +static DEVICE_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif -int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct device *dev) { int err = 0; #ifdef CONFIG_SCHED_SMT if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); + err = device_create_file(dev, &dev_attr_sched_smt_power_savings); #endif #ifdef CONFIG_SCHED_MC if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); + err = device_create_file(dev, &dev_attr_sched_mc_power_savings); #endif return err; } -- cgit v1.2.3 From 7239f65cf364180cdb100a4ed211b2a9f9a72119 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 16:12:37 -0800 Subject: clockevents: remove sysdev.h This isn't needed in the clockevents.c file, and the header file is going away soon, so just remove the #include Cc: Thomas Gleixner Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/time/clockevents.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6c..9cd928f7a7c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "tick-internal.h" -- cgit v1.2.3 From 026085ef5ae07c3197f2baacc091ce067b86ed11 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 21 Dec 2011 20:18:35 -0800 Subject: cgroup: remove redundant get/put of old css_set from migrate We can now assume that the css_set reference held by the task will not go away for an exiting task. PF_EXITING state can be trusted throughout migration by checking it after locking threadgroup. Changes in V4: * https://lkml.org/lkml/2011/12/20/368 (Tejun Heo) * Fix typo in commit message * Undid the rename of css_set_check_fetched * https://lkml.org/lkml/2011/12/20/427 (Li Zefan) * Fix comment in cgroup_task_migrate() Changes in V3: * https://lkml.org/lkml/2011/12/20/255 (Frederic Weisbecker) * Fixed to put error in retval Changes in V2: * https://lkml.org/lkml/2011/12/19/289 (Tejun Heo) * Updated commit message -tj: removed stale patch description about dropped function rename. Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4936d8886b4f..82288088f6a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1850,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct css_set *newcg; /* - * get old css_set. We are synchronized through threadgroup_lock() - * against PF_EXITING setting such that we can't race against - * cgroup_exit() changing the css_set to init_css_set and dropping the - * old one. + * We are synchronized through threadgroup_lock() against PF_EXITING + * setting such that we can't race against cgroup_exit() changing the + * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - get_css_set(oldcg); /* locate or allocate a new css_set for this task. */ if (guarantee) { @@ -1872,12 +1870,9 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, might_sleep(); /* find_css_set will give us newcg already referenced. */ newcg = find_css_set(oldcg, cgrp); - if (!newcg) { - put_css_set(oldcg); + if (!newcg) return -ENOMEM; - } } - put_css_set(oldcg); task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); @@ -2186,18 +2181,11 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * init_css_set. */ oldcg = tc->task->cgroups; - get_css_set(oldcg); - /* see if the new one for us is already in the list? */ - if (css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) { - /* was already there, nothing to do. */ - put_css_set(oldcg); - } else { - /* we don't already have it. get new one. */ - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - put_css_set(oldcg); - if (retval) + + /* if we don't already have it in the list get a new one */ + if (!css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) + if (retval = css_set_prefetch(cgrp, oldcg, &newcg_list)) goto out_list_teardown; - } } /* -- cgit v1.2.3 From b07ef7741122a83575499c11417e514877941e76 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 21 Dec 2011 20:18:36 -0800 Subject: cgroup: remove redundant get/put of task struct threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. Changes in V2: * https://lkml.org/lkml/2011/12/20/369 (Tejun Heo) * Undo incorrect removal of get/put from attach_task_by_pid() * Author * Remove a comment which is made stale by this change Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 82288088f6a5..a85a7002ca33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2116,7 +2116,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = -EAGAIN; goto out_free_group_list; } - /* take a reference on each task in the group to go in the array. */ + tsk = leader; i = nr_migrating_tasks = 0; do { @@ -2128,7 +2128,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - get_task_struct(tsk); /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2150,7 +2149,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* methods shouldn't be called if no task is actually migrating */ retval = 0; if (!nr_migrating_tasks) - goto out_put_tasks; + goto out_free_group_list; /* * step 1: check that we can legitimately attach to the cgroup. @@ -2234,12 +2233,6 @@ out_cancel_attach: ss->cancel_attach(ss, cgrp, &tset); } } -out_put_tasks: - /* clean up the array of referenced threads in the group. */ - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - put_task_struct(tc->task); - } out_free_group_list: flex_array_free(group); return retval; -- cgit v1.2.3 From 892a2b90ba15cb7dbee40979f23fdb492913abf8 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 21 Dec 2011 20:18:37 -0800 Subject: cgroup: only need to check oldcgrp==newgrp once In cgroup_attach_proc it is now sufficient to only check that oldcgrp==newcgrp once. Now that we are using threadgroup_lock() during the migrations, oldcgrp will not change. Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a85a7002ca33..1042b3c41314 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2067,7 +2067,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, */ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { - int retval, i, group_size, nr_migrating_tasks; + int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ struct css_set *oldcg; @@ -2118,7 +2118,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } tsk = leader; - i = nr_migrating_tasks = 0; + i = 0; do { struct task_and_cgroup ent; @@ -2134,11 +2134,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); + /* nothing to do if this task is already in the cgroup */ + if (ent.cgrp == cgrp) + continue; retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - if (ent.cgrp != cgrp) - nr_migrating_tasks++; } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; @@ -2148,7 +2149,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* methods shouldn't be called if no task is actually migrating */ retval = 0; - if (!nr_migrating_tasks) + if (!group_size) goto out_free_group_list; /* @@ -2171,14 +2172,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - /* nothing to do if this task is already in the cgroup */ - if (tc->cgrp == cgrp) - continue; - /* - * get old css_set pointer. threadgroup is locked so this is - * safe against concurrent cgroup_exit() changing this to - * init_css_set. - */ oldcg = tc->task->cgroups; /* if we don't already have it in the list get a new one */ @@ -2194,9 +2187,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - /* leave current thread as it is if it's already there */ - if (tc->cgrp == cgrp) - continue; retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); BUG_ON(retval); } -- cgit v1.2.3 From c87fb57346fc7653ace98769f148e0dcd88ac1ee Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Wed, 14 Dec 2011 23:43:16 +0100 Subject: ARM: 7235/1: irqdomain: export irq_domain_simple_ops for !CONFIG_OF irqdomain support is used in interrupt controller drivers that may not have device tree support but only need the basic HW->Linux irq translation. Rather than having each of these implement their own IRQ domain, allow them to use the simple ops. Acked-by: Thomas Gleixner Acked-by: Rob Herring Cc: Grant Likely Signed-off-by: Jamie Iles Signed-off-by: Russell King --- kernel/irq/irqdomain.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 200ce832c585..7ca523b249ef 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -143,11 +143,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, return 0; } -struct irq_domain_ops irq_domain_simple_ops = { - .dt_translate = irq_domain_simple_dt_translate, -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - /** * irq_domain_create_simple() - Set up a 'simple' translation range */ @@ -182,3 +177,10 @@ void irq_domain_generate_simple(const struct of_device_id *match, } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ + +struct irq_domain_ops irq_domain_simple_ops = { +#ifdef CONFIG_OF_IRQ + .dt_translate = irq_domain_simple_dt_translate, +#endif /* CONFIG_OF_IRQ */ +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit v1.2.3 From a65cf5181aa608addcb2873c8ed90413a9f539cb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 Nov 2011 20:39:59 +0800 Subject: jump-label: export jump_label_inc/jump_label_dec Export these two symbols, they will be used by KVM mmu audit Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- kernel/jump_label.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 66ff7109f697..2af9027106a8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key) atomic_inc(&key->enabled); jump_label_unlock(); } +EXPORT_SYMBOL_GPL(jump_label_inc); void jump_label_dec(struct jump_label_key *key) { @@ -80,6 +81,7 @@ void jump_label_dec(struct jump_label_key *key) jump_label_update(key, JUMP_LABEL_DISABLE); jump_label_unlock(); } +EXPORT_SYMBOL_GPL(jump_label_dec); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { -- cgit v1.2.3 From 1c6c3fad81787e8cb4c85ddfd573b0d8442fe630 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 27 Dec 2011 07:46:25 +0200 Subject: cgroup: mark cgroup_rmdir_waitq and cgroup_attach_proc() as static Signed-off-by: Kirill A. Shutemov Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1042b3c41314..421557fcbfe4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -938,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ -DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); +static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { @@ -2065,7 +2065,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, * Call holding cgroup_mutex and the group_rwsem of the leader. Will take * task_lock of each thread in leader's threadgroup individually in turn. */ -int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; -- cgit v1.2.3 From c6ca57500c23d57a4ccec9874b6a3c99c297e1b5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 27 Dec 2011 07:46:26 +0200 Subject: cgroup: add sparse annotation to cgroup_iter_start() and cgroup_iter_end() Signed-off-by: Kirill A. Shutemov Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 421557fcbfe4..c6bd67b3fcf6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2825,6 +2825,7 @@ static void cgroup_enable_task_cg_lists(void) } void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) + __acquires(css_set_lock) { /* * The first time anyone tries to iterate across a cgroup, @@ -2864,6 +2865,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, } void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) + __releases(css_set_lock) { read_unlock(&css_set_lock); } -- cgit v1.2.3 From 7e3aa30ac8c904a706518b725c451bb486daaae9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 23 Dec 2011 04:25:23 +0100 Subject: cgroup: Remove task_lock() from cgroup_post_fork() cgroup_post_fork() is protected between threadgroup_change_begin() and threadgroup_change_end() against concurrent changes of the child's css_set in cgroup_task_migrate(). Also the child can't exit and call cgroup_exit() at this stage, this means it's css_set can't be changed with init_css_set concurrently. For these reasons, we don't need to hold task_lock() on the child because it's css_set can only remain stable in this place. Let's remove the lock there. v2: Update comment to explain that we are safe against cgroup_exit() Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Containers Cc: Cgroups Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage Cc: Mandeep Singh Baines --- kernel/cgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c6bd67b3fcf6..548d8d4e86d0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4595,10 +4595,19 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); - task_lock(child); - if (list_empty(&child->cg_list)) + if (list_empty(&child->cg_list)) { + /* + * It's safe to use child->cgroups without task_lock() + * here because we are protected through + * threadgroup_change_begin() against concurrent + * css_set change in cgroup_task_migrate(). Also + * the task can't exit at that point until + * wake_up_new_task() is called, so we are protected + * against cgroup_exit() setting child->cgroup to + * init_css_set. + */ list_add(&child->cg_list, &child->cgroups->tasks); - task_unlock(child); + } write_unlock(&css_set_lock); } } -- cgit v1.2.3 From 93797d87d63d36404907640e4e20bb976bff4744 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 12 Dec 2011 09:59:14 -0600 Subject: irq: check domain hwirq range for DT translate A DT node may have more than 1 domain associated with it, so make sure the hwirq number is within range when doing DT translation. Signed-off-by: Rob Herring Acked-by: Grant Likely Acked-by: Shawn Guo Cc: Thomas Gleixner --- kernel/irq/irqdomain.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 200ce832c585..8bd7479b8390 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, return -EINVAL; if (intsize < 1) return -EINVAL; + if (d->nr_irq && ((intspec[0] < d->hwirq_base) || + (intspec[0] >= d->hwirq_base + d->nr_irq))) + return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; -- cgit v1.2.3 From d36b691077dc59c74efec0d54ed21b86f7a2a21a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Dec 2011 17:09:01 -0500 Subject: misc latin1 to utf8 conversions Signed-off-by: Al Viro Signed-off-by: Jiri Kosina --- kernel/events/core.c | 2 +- kernel/events/ring_buffer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f95..f641547beb76 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. + * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0f..809c8ec5d42a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. + * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING */ -- cgit v1.2.3 From 32dc730860155b235f13e0cd3fe58b263279baf9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 20:08:42 -0500 Subject: get rid of timer in kern/acct.c ... and clean it up a bit, while we are at it Signed-off-by: Al Viro --- kernel/acct.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index fa7eb3de2ddc..8cba12429d82 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,26 +84,16 @@ static void do_acct_process(struct bsd_acct_struct *acct, * the cache line to have the data after getting the lock. */ struct bsd_acct_struct { - volatile int active; - volatile int needcheck; + int active; + unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct timer_list timer; struct list_head list; }; static DEFINE_SPINLOCK(acct_lock); static LIST_HEAD(acct_list); -/* - * Called whenever the timer says to check the free space. - */ -static void acct_timeout(unsigned long x) -{ - struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; - acct->needcheck = 1; -} - /* * Check the amount of free space and suspend/resume accordingly. */ @@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) struct kstatfs sbuf; int res; int act; - sector_t resume; - sector_t suspend; + u64 resume; + u64 suspend; spin_lock(&acct_lock); res = acct->active; - if (!file || !acct->needcheck) + if (!file || time_is_before_jiffies(acct->needcheck)) goto out; spin_unlock(&acct_lock); @@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; - sector_div(suspend, 100); - sector_div(resume, 100); + do_div(suspend, 100); + do_div(resume, 100); if (sbuf.f_bavail <= suspend) act = -1; @@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) } } - del_timer(&acct->timer); - acct->needcheck = 0; - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); + acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; res = acct->active; out: spin_unlock(&acct_lock); @@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (acct->file) { old_acct = acct->file; old_ns = acct->ns; - del_timer(&acct->timer); acct->active = 0; - acct->needcheck = 0; acct->file = NULL; acct->ns = NULL; list_del(&acct->list); @@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (file) { acct->file = file; acct->ns = ns; - acct->needcheck = 0; + acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; acct->active = 1; list_add(&acct->list, &acct_list); - /* It's been deleted if it was used before so this is safe */ - setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); - acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct->timer); } if (old_acct) { mnt_unpin(old_acct->f_path.mnt); @@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns) if (acct == NULL) return; - del_timer_sync(&acct->timer); spin_lock(&acct_lock); if (acct->file != NULL) acct_file_reopen(acct, NULL, NULL); @@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset((caddr_t)&ac, 0, sizeof(acct_t)); + memset(&ac, 0, sizeof(acct_t)); ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); -- cgit v1.2.3 From ff01bb4832651c6d25ac509a06a10fcbd75c461c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Sep 2011 02:31:11 -0400 Subject: fs: move code out of buffer.c Move invalidate_bdev, block_sync_page into fs/block_dev.c. Export kill_bdev as well, so brd doesn't have to open code it. Reduce buffer_head.h requirement accordingly. Removed a rather large comment from invalidate_bdev, as it looked a bit obsolete to bother moving. The small comment replacing it says enough. Signed-off-by: Nick Piggin Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/power/swap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11a594c4ba25..3739ecced085 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 18bb1db3e7607e4a997d50991a6f9fa5b0f8722c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:41:39 -0400 Subject: switch vfs_mkdir() and ->mkdir() to umode_t vfs_mkdir() gets int, but immediately drops everything that might not fit into umode_t and that's the only caller of ->mkdir()... Signed-off-by: Al Viro --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a184470cf9b5..b37a0ea55114 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -760,7 +760,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. */ -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); @@ -3846,7 +3846,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct cgroup *c_parent = dentry->d_parent->d_fsdata; -- cgit v1.2.3 From f4ae40a6a50a98ac23d4b285f739455e926a473e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 04:33:43 -0400 Subject: switch debugfs to umode_t Signed-off-by: Al Viro --- kernel/relay.c | 2 +- kernel/trace/blktrace.c | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 226fade4d727..4335e1d7ee2d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, */ static struct dentry *create_buf_file_default_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 16fc34a0806f..cdea7b56b0c9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry) static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, - int mode, + umode_t mode, struct rchan_buf *buf, int *is_global) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60f..660b069a0f99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4385,7 +4385,7 @@ static const struct file_operations trace_options_core_fops = { }; struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..0154c0b850de 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -312,7 +312,7 @@ void tracing_reset_current(int cpu); void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, - mode_t mode, + umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); -- cgit v1.2.3 From a5e7ed3287e45f2eafbcf9e7e6fdc5a0191acf40 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:55:55 -0400 Subject: cgroup: propagate mode_t Signed-off-by: Al Viro --- kernel/cgroup.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b37a0ea55114..86ebacfd9431 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -775,7 +775,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct cgroup *child); -static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) +static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -2585,7 +2585,7 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_create_file(struct dentry *dentry, mode_t mode, +static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { struct inode *inode; @@ -2626,7 +2626,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - mode_t mode) + umode_t mode) { struct dentry *parent; int error = 0; @@ -2653,9 +2653,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, * returns S_IRUGO if it has only a read handler * returns S_IWUSR if it has only a write hander */ -static mode_t cgroup_file_mode(const struct cftype *cft) +static umode_t cgroup_file_mode(const struct cftype *cft) { - mode_t mode = 0; + umode_t mode = 0; if (cft->mode) return cft->mode; @@ -2678,7 +2678,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; - mode_t mode; + umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -3752,7 +3752,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - mode_t mode) + umode_t mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; -- cgit v1.2.3 From 36fcb589e752fa9c71f8a447db94126d102fd937 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 03:47:31 -0400 Subject: sysctl: use umode_t for table permissions Signed-off-by: Al Viro --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d6b149ccf925..e64f45741e0e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6480,7 +6480,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; -- cgit v1.2.3 From df0a42837b86567a130c44515ab620d23e7f182b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 05:26:10 -0400 Subject: switch mq_open() to umode_t --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 47b7fc1ea893..9849213e501c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -234,7 +234,7 @@ struct audit_context { } mq_sendrecv; struct { int oflag; - mode_t mode; + umode_t mode; struct mq_attr attr; } mq_open; struct { @@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic) break; } case AUDIT_MQ_OPEN: { audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", context->mq_open.oflag, context->mq_open.mode, context->mq_open.attr.mq_flags, @@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @attr: queue attributes * */ -void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) +void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { struct audit_context *context = current->audit_context; -- cgit v1.2.3 From 2570ebbd1f1ce1ef31f568b0660354fc59424be2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jul 2011 14:03:22 -0400 Subject: switch kern_ipc_perm to umode_t Signed-off-by: Al Viro --- kernel/auditsc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9849213e501c..7a074d65fff4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -210,12 +210,12 @@ struct audit_context { struct { uid_t uid; gid_t gid; - mode_t mode; + umode_t mode; u32 osid; int has_perm; uid_t perm_uid; gid_t perm_gid; - mode_t perm_mode; + umode_t perm_mode; unsigned long qbytes; } ipc; struct { @@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_IPC: { u32 osid = context->ipc.osid; - audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", context->ipc.uid, context->ipc.gid, context->ipc.mode); if (osid) { char *ctx = NULL; @@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic) ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", + "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, @@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Called only after audit_ipc_obj(). */ -void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { struct audit_context *context = current->audit_context; -- cgit v1.2.3 From 93d3a10ef4fdfd4b6d1a3f09b645cd08f74a8115 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jul 2011 14:04:25 -0400 Subject: auditsc: propage umode_t Signed-off-by: Al Viro --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7a074d65fff4..e7fe2b0d29b3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; - mode_t mode = which & S_IFMT; + umode_t mode = which & S_IFMT; if (unlikely(!ctx)) return 0; @@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->ino != (unsigned long)-1) { audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#o" + " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), -- cgit v1.2.3 From 305f3c8b20ba1ca94829329acdbf22e689304dba Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Jan 2012 10:24:29 +0300 Subject: cgroup: move assignement out of condition in cgroup_attach_proc() Gcc complains about this: "kernel/cgroup.c:2179:4: warning: suggest parentheses around assignment used as truth value [-Wparentheses]" Signed-off-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 548d8d4e86d0..bab5c17e7781 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2175,9 +2175,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) oldcg = tc->task->cgroups; /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) - if (retval = css_set_prefetch(cgrp, oldcg, &newcg_list)) + if (!css_set_check_fetched(cgrp, tc->task, oldcg, + &newcg_list)) { + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + if (retval) goto out_list_teardown; + } } /* -- cgit v1.2.3 From c336078bf65c4d38caa9a4b8b7b7261c778e622c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 27 Dec 2011 22:54:52 +0100 Subject: PM / Hibernate: Implement compat_ioctl for /dev/snapshot This allows uswsusp built for i386 to run on an x86_64 kernel (tested with Debian package version 1.0+20110509-2). References: http://bugs.debian.org/502816 Signed-off-by: Ben Hutchings Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 78bdb4404aab..6b1ab7a88522 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -380,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, return error; } +#ifdef CONFIG_COMPAT + +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); + + switch (cmd) { + case SNAPSHOT_GET_IMAGE_SIZE: + case SNAPSHOT_AVAIL_SWAP_SIZE: + case SNAPSHOT_ALLOC_SWAP_PAGE: { + compat_loff_t __user *uoffset = compat_ptr(arg); + loff_t offset; + mm_segment_t old_fs; + int err; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, cmd, (unsigned long) &offset); + set_fs(old_fs); + if (!err && put_user(offset, uoffset)) + err = -EFAULT; + return err; + } + + case SNAPSHOT_CREATE_IMAGE: + return snapshot_ioctl(file, cmd, + (unsigned long) compat_ptr(arg)); + + case SNAPSHOT_SET_SWAP_AREA: { + struct compat_resume_swap_area __user *u_swap_area = + compat_ptr(arg); + struct resume_swap_area swap_area; + mm_segment_t old_fs; + int err; + + err = get_user(swap_area.offset, &u_swap_area->offset); + err |= get_user(swap_area.dev, &u_swap_area->dev); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, + (unsigned long) &swap_area); + set_fs(old_fs); + return err; + } + + default: + return snapshot_ioctl(file, cmd, arg); + } +} + +#endif /* CONFIG_COMPAT */ + static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, @@ -387,6 +448,9 @@ static const struct file_operations snapshot_fops = { .write = snapshot_write, .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = snapshot_compat_ioctl, +#endif }; static struct miscdevice snapshot_device = { -- cgit v1.2.3 From 0d19ea866562e46989412a0676412fa0983c9ce7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 27 Dec 2011 14:25:55 +0800 Subject: cgroup: fix to allow mounting a hierarchy by name If we mount a hierarchy with a specified name, the name is unique, and we can use it to mount the hierarchy without specifying its set of subsystem names. This feature is documented is Documentation/cgroups/cgroups.txt section 2.3 Here's an example: # mount -t cgroup -o cpuset,name=myhier xxx /cgroup1 # mount -t cgroup -o name=myhier xxx /cgroup2 But it was broken by commit 32a8cf235e2f192eb002755076994525cdbaa35a (cgroup: make the mount options parsing more accurate) This fixes the regression. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bab5c17e7781..39c7caef085a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1193,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* * If the 'all' option was specified select all the subsystems, - * otherwise 'all, 'none' and a subsystem name options were not - * specified, let's default to 'all' + * otherwise if 'none', 'name=' and a subsystem name options + * were not specified, let's default to 'all' */ - if (all_ss || (!all_ss && !one_ss && !opts->none)) { + if (all_ss || (!one_ss && !opts->none && !opts->name)) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss == NULL) -- cgit v1.2.3 From b7e724d303b684655e4ca3dabd5a6840ad19012d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: reverse arguments to security_capable security_capable takes ns, cred, cap. But the LSM capable() hook takes cred, ns, cap. The capability helper functions also take cred, ns, cap. Rather than flip argument order just to flip it back, leave them alone. Heck, this should be a little faster since argument will be in the right place! Signed-off-by: Eric Paris --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 283c529f8b1c..d98392719adb 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -374,7 +374,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(ns, current_cred(), cap) == 0) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3 From 2920a8409de5a51575d03deca07e5bb2be6fc98d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: remove all _real_ interfaces The name security_real_capable and security_real_capable_noaudit just don't make much sense to me. Convert them to use security_capable and security_capable_noaudit. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index d98392719adb..ff50ab62cfca 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -298,7 +298,11 @@ error: */ bool has_capability(struct task_struct *t, int cap) { - int ret = security_real_capable(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), &init_user_ns, cap); + rcu_read_unlock(); return (ret == 0); } @@ -317,7 +321,11 @@ bool has_capability(struct task_struct *t, int cap) bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { - int ret = security_real_capable(t, ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable(__task_cred(t), ns, cap); + rcu_read_unlock(); return (ret == 0); } @@ -335,7 +343,11 @@ bool has_ns_capability(struct task_struct *t, */ bool has_capability_noaudit(struct task_struct *t, int cap) { - int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + int ret; + + rcu_read_lock(); + ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap); + rcu_read_unlock(); return (ret == 0); } -- cgit v1.2.3 From 25e75703410a84b80623da3653db6b70282e5c6a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: call has_ns_capability from has_capability Declare the more specific has_ns_capability first in the code and then call it from has_capability. The declaration reversal isn't stricty necessary since they are both declared in header files, but it just makes sense to put more specific functions first in the code. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index ff50ab62cfca..fb815d1b9ea2 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -287,47 +287,41 @@ error: } /** - * has_capability - Does a task have a capability in init_user_ns + * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. + * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability(struct task_struct *t, int cap) +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), &init_user_ns, cap); + ret = security_capable(__task_cred(t), ns, cap); rcu_read_unlock(); return (ret == 0); } /** - * has_capability - Does a task have a capability in a specific user ns + * has_capability - Does a task have a capability in init_user_ns * @t: The task in question - * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to the specified user namespace, false if not. + * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_ns_capability(struct task_struct *t, - struct user_namespace *ns, int cap) +bool has_capability(struct task_struct *t, int cap) { - int ret; - - rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); - rcu_read_unlock(); - - return (ret == 0); + return has_ns_capability(t, &init_user_ns, cap); } /** -- cgit v1.2.3 From 7b61d648499e74dbec3d4ce645675e0ae040ae78 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilites: introduce new has_ns_capabilities_noaudit For consistency in interfaces, introduce a new interface called has_ns_capabilities_noaudit. It checks if the given task has the given capability in the given namespace. Use this new function by has_capabilities_noaudit. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index fb815d1b9ea2..d8398e962470 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -325,27 +325,47 @@ bool has_capability(struct task_struct *t, int cap) } /** - * has_capability_noaudit - Does a task have a capability (unaudited) + * has_ns_capability_noaudit - Does a task have a capability (unaudited) + * in a specific user ns. * @t: The task in question + * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability - * currently in effect to init_user_ns, false if not. Don't write an - * audit message for the check. + * currently in effect to the specified user namespace, false if not. + * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ -bool has_capability_noaudit(struct task_struct *t, int cap) +bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap); + ret = security_capable_noaudit(__task_cred(t), ns, cap); rcu_read_unlock(); return (ret == 0); } +/** + * has_capability_noaudit - Does a task have a capability (unaudited) in the + * initial user ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + return has_ns_capability_noaudit(t, &init_user_ns, cap); +} + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3 From 105ddf49cd301b7929a92f269440e8e562ef19db Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: style only - move capable below ns_capable Although the current code is fine for consistency this moves the capable code below the function it calls in the c file. It doesn't actually change code. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index d8398e962470..5f99e5d68e1f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -366,22 +366,6 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -408,6 +392,22 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + /** * task_ns_capable - Determine whether current task has a superior * capability targeted at a specific task's user namespace. -- cgit v1.2.3 From d2a7009f0bb03fa22ad08dd25472efa0568126b9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabitlies: ns_capable can use the cap helpers rather than lsm call Just to reduce the number of places to change if we every change the LSM hook, use the capability helpers internally when possible. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 5f99e5d68e1f..47626446c39a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (security_capable(current_cred(), ns, cap) == 0) { + if (has_ns_capability(current, ns, cap)) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3 From f1c84dae0ecc51aa35c81f19a0ebcd6c0921ddcb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: capabilities: remove task_ns_* functions task_ in the front of a function, in the security subsystem anyway, means to me at least, that we are operating with that task as the subject of the security decision. In this case what it means is that we are using current as the subject but we use the task to get the right namespace. Who in the world would ever realize that's what task_ns_capability means just by the name? This patch eliminates the task_ns functions entirely and uses the has_ns_capability function instead. This means we explicitly open code the ns in question in the caller. I think it makes the caller a LOT more clear what is going on. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/capability.c | 14 -------------- kernel/ptrace.c | 4 ++-- kernel/sched.c | 2 +- 3 files changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 47626446c39a..74fb3b603045 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -408,20 +408,6 @@ bool capable(int cap) } EXPORT_SYMBOL(capable); -/** - * task_ns_capable - Determine whether current task has a superior - * capability targeted at a specific task's user namespace. - * @t: The task whose user namespace is targeted. - * @cap: The capability in question. - * - * Return true if it does, false otherwise. - */ -bool task_ns_capable(struct task_struct *t, int cap) -{ - return ns_capable(task_cred_xxx(t, user)->user_ns, cap); -} -EXPORT_SYMBOL(task_ns_capable); - /** * nsown_capable - Check superior capability to one's own user_ns * @cap: The capability in question diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a70d2a5d8c7b..210bbf045ee9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -196,7 +196,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) + if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -266,7 +266,7 @@ static int ptrace_attach(struct task_struct *task, long request, task->ptrace = PT_PTRACED; if (seize) task->ptrace |= PT_SEIZED; - if (task_ns_capable(task, CAP_SYS_PTRACE)) + if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); diff --git a/kernel/sched.c b/kernel/sched.c index b50b0f0c9aa9..5670028a9c16 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5409,7 +5409,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) + if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); -- cgit v1.2.3 From 69f594a38967f4540ce7a29b3fd214e68a8330bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: ptrace: do not audit capability check when outputing /proc/pid/stat Reading /proc/pid/stat of another process checks if one has ptrace permissions on that process. If one does have permissions it outputs some data about the process which might have security and attack implications. If the current task does not have ptrace permissions the read still works, but those fields are filled with inocuous (0) values. Since this check and a subsequent denial is not a violation of the security policy we should not audit such denials. This can be quite useful to removing ptrace broadly across a system without flooding the logs when ps is run or something which harmlessly walks proc. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- kernel/ptrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 210bbf045ee9..c890ac9a7962 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -161,6 +161,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } +static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +{ + if (mode & PTRACE_MODE_NOAUDIT) + return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + else + return has_ns_capability(current, ns, CAP_SYS_PTRACE); +} + int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; @@ -187,7 +195,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) cred->gid == tcred->sgid && cred->gid == tcred->gid)) goto ok; - if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + if (ptrace_has_cap(tcred->user->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -196,7 +204,7 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) return -EPERM; return security_ptrace_access_check(task, mode); -- cgit v1.2.3 From fd778461524849afd035679030ae8e8873c72b81 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:16 -0500 Subject: security: remove the security_netlink_recv hook as it is equivalent to capable() Once upon a time netlink was not sync and we had to get the effective capabilities from the skb that was being received. Today we instead get the capabilities from the current task. This has rendered the entire purpose of the hook moot as it is now functionally equivalent to the capable() call. Signed-off-by: Eric Paris --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0a1355ca3d79..f3ba55fa0b70 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: - if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) + if (!capable(CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ -- cgit v1.2.3 From d8c9584ea2a92879f471fd3a2be3af6c534fb035 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Dec 2011 18:16:57 -0500 Subject: vfs: prefer ->dentry->d_sb to ->mnt->mnt_sb Signed-off-by: Al Viro --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 8cba12429d82..9663eb8058f9 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -315,7 +315,7 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + if (acct->file && acct->file->f_path.dentry->d_sb == sb) { acct_file_reopen(acct, NULL, NULL); goto restart; } -- cgit v1.2.3 From 34c80b1d93e6e20ca9dea0baf583a5b5510d92d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 21:32:45 -0500 Subject: vfs: switch ->show_options() to struct dentry * Signed-off-by: Al Viro --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86ebacfd9431..7cab65f83f1d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1038,9 +1038,9 @@ static int rebind_subsystems(struct cgroupfs_root *root, return 0; } -static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { - struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; + struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); -- cgit v1.2.3 From a0e86bd4252519321b0d102dc4ed90557aa7bee9 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 8 Jan 2012 22:44:29 +0100 Subject: audit: always follow va_copy() with va_end() A call to va_copy() should always be followed by a call to va_end() in the same function. In kernel/autit.c::audit_log_vformat() this is not always done. This patch makes sure va_end() is always called. Signed-off-by: Jesper Juhl Cc: Al Viro Cc: Eric Paris Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 09fae2677a45..2c1d6ab7106e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) - goto out; + goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); } - va_end(args2); if (len > 0) skb_put(skb, len); +out_va_end: + va_end(args2); out: return; } -- cgit v1.2.3 From 9b9fb610f6800e0db46cccd8618dd7e609c9bb5a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 10 Jan 2012 09:24:05 +0900 Subject: sched: Remove empty #ifdefs Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F0B8525.8070901@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbfd04a2148..457c881873cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7136,10 +7136,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_RT_GROUP_SCHED -#else /* !CONFIG_RT_GROUP_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); @@ -7248,9 +7244,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -#endif - #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { -- cgit v1.2.3 From c6968e73b90c2a2fb9a32d4bad249f8f70f70125 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:31 -0800 Subject: PM/Hibernate: do not count debug pages as savable When debugging with CONFIG_DEBUG_PAGEALLOC and debug_guardpage_minorder > 0, we have lot of free pages that are not marked so. Snapshot code account them as savable, what cause hibernate memory preallocation failure. It is pretty hard to make hibernate allocation succeed with debug_guardpage_minorder=1. This change at least make it possible when system has relatively big amount of RAM. Signed-off-by: Stanislaw Gruszka Acked-by: Rafael J. Wysocki Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c1441392..1cf88900ec4f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } -- cgit v1.2.3 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. One of problem is that it's inherited at fork(). When a daemon set oom_score_adj and make children, it's hard to know where the value is set. This patch adds some tracepoints useful for debugging. This patch adds 3 trace points. - creating new task - renaming a task (exec) - set oom_score_adj To debug, users need to enable some trace pointer. Maybe filtering is useful as # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable output will be like this. # grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c13..5e1391b5ade0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: -- cgit v1.2.3 From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bb0efa5705ed..d532f1709fbf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2318,6 +2318,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take -- cgit v1.2.3 From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:11:35 -0800 Subject: workqueue: make alloc_workqueue() take printf fmt and args for name alloc_workqueue() currently expects the passed in @name pointer to remain accessible. This is inconvenient and a bit silly given that the whole wq is being dynamically allocated. This patch updates alloc_workqueue() and friends to take printf format string instead of opaque string and matching varargs at the end. The name is allocated together with the wq and formatted. alloc_ordered_workqueue() is converted to a macro to unify varargs handling with alloc_workqueue(), and, while at it, add comment to alloc_workqueue(). None of the current in-kernel users pass in string with '%' as constant name and this change shouldn't cause any problem. [akpm@linux-foundation.org: use __printf] Signed-off-by: Tejun Heo Suggested-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a810..bec7b5b53e03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; -- cgit v1.2.3 From 6b550f9495947fc279d12c38feaf98500e8d0646 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Tue, 10 Jan 2012 15:11:37 -0800 Subject: user namespace: make signal.c respect user namespaces ipc/mqueue.c: for __SI_MESQ, convert the uid being sent to recipient's user namespace. (new, thanks Oleg) __send_signal: convert current's uid to the recipient's user namespace for any siginfo which is not SI_FROMKERNEL (patch from Oleg, thanks again :) do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's user namespace ptrace_signal maps parent's uid into current's user namespace before including in signal to current. IIUC Oleg has argued that this shouldn't matter as the debugger will play with it, but it seems like not converting the value currently being set is misleading. Changelog: Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to simplify callers and help make clear what we are translating (which uid into which namespace). Passing the target task would make callers even easier to read, but we pass in user_ns because current_user_ns() != task_cred_xxx(current, user_ns). Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock in ptrace_signal(). Sep 23: In send_signal(), detect when (user) signal is coming from an ancestor or unrelated user namespace. Pass that on to __send_signal, which sets si_uid to 0 or overflowuid if needed. Oct 12: Base on Oleg's fixup_uid() patch. On top of that, handle all SI_FROMKERNEL cases at callers, because we can't assume sender is current in those cases. Nov 10: (mhelsley) rename fixup_uid to more meaningful usern_fixup_signal_uid Nov 10: (akpm) make the !CONFIG_USER_NS case clearer Signed-off-by: Serge Hallyn Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" From: Serge Hallyn Subject: __send_signal: pass q->info, not info, to userns_fixup_signal_uid (v2) Eric Biederman pointed out that passing info is a bug and could lead to a NULL pointer deref to boot. A collection of signal, securebits, filecaps, cap_bounds, and a few other ltp tests passed with this kernel. Changelog: Nov 18: previous patch missed a leading '&' Signed-off-by: Serge Hallyn Cc: "Eric W. Biederman" From: Dan Carpenter Subject: ipc/mqueue: lock() => unlock() typo There was a double lock typo introduced in b085f4bd6b21 "user namespace: make signal.c respect user namespaces" Signed-off-by: Dan Carpenter Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d532f1709fbf..c73c4284160e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ -- cgit v1.2.3 From bced76aeaca03b45e3b4bdb868cada328e497847 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2012 13:11:12 +0100 Subject: sched: Fix lockup by limiting load-balance retries on lock-break Eric and David reported dead machines and traced it to commit a195f004 ("sched: Fix load-balance lock-breaking"), it turns out there's still a scenario where we can end up re-trying forever. Since there is no strict forward progress guarantee in the load-balance iteration we can get stuck re-retrying the same task-set over and over. Creating a forward progress guarantee with the existing structure is somewhat non-trivial, for now simply terminate the retry loop after a few tries. Reported-by: Eric Dumazet Tested-by: Eric Dumazet Reported-by: David Ahern [ logic cleanup as suggested by Eric ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Frederic Weisbecker Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1326297936.2442.157.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e42de9105f8..84adb2d66cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) } #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -4508,7 +4510,9 @@ redo: goto out_balanced; if (lb_flags & LBF_NEED_BREAK) { - lb_flags &= ~LBF_NEED_BREAK; + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; goto redo; } -- cgit v1.2.3 From 70b1e9161e903a9e1682aca3a832ed29ef876a4d Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sat, 12 Nov 2011 19:08:55 -0800 Subject: module: Add comments describing how the "strmap" logic works Signed-off-by: Kevin Cernekee Signed-off-by: Rusty Russell --- kernel/module.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 178333c48d1e..cf9f1b6b3268 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2193,6 +2193,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); + + /* + * info->strmap has a '1' bit for each byte of .strtab we want to + * keep resident in mod->core_strtab. Everything else in .strtab + * is unreferenced by the symbols in mod->core_symtab, and will be + * discarded when add_kallsyms() compacts the string table. + */ for (ndst = i = 1; i < nsrc; ++i, ++src) if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; @@ -2215,6 +2222,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Append room for core symbols' strings at end of core part. */ info->stroffs = mod->core_size; + + /* First strtab byte (and first symtab entry) are zeroes. */ __set_bit(0, info->strmap); mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -- cgit v1.2.3 From 48fd11880b5ef04270be8a87d9a9a9ee2fdae338 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Fri, 13 Jan 2012 09:32:14 +1030 Subject: module: Fix performance regression on modules with large symbol tables Looking at /proc/kallsyms, one starts to ponder whether all of the extra strtab-related complexity in module.c is worth the memory savings. Instead of making the add_kallsyms() loop even more complex, I tried the other route of deleting the strmap logic and naively copying each string into core_strtab with no consideration for consolidating duplicates. Performance on an "already exists" insmod of nvidia.ko (runs add_kallsyms() but does not actually initialize the module): Original scheme: 1.230s With naive copying: 0.058s Extra space used: 35k (of a 408k module). Signed-off-by: Kevin Cernekee Signed-off-by: Rusty Russell LKML-Reference: <73defb5e4bca04a6431392cc341112b1@localhost> --- kernel/module.c | 65 +++++++++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cf9f1b6b3268..4928cffc3dcc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -138,7 +138,6 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long *strmap; unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; @@ -2178,12 +2177,19 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ static void layout_symtab(struct module *mod, struct load_info *info) { Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst; + unsigned int i, nsrc, ndst, strtab_size; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2194,38 +2200,23 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* - * info->strmap has a '1' bit for each byte of .strtab we want to - * keep resident in mod->core_strtab. Everything else in .strtab - * is unreferenced by the symbols in mod->core_symtab, and will be - * discarded when add_kallsyms() compacts the string table. - */ - for (ndst = i = 1; i < nsrc; ++i, ++src) + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - unsigned int j = src->st_name; - - while (!__test_and_set_bit(j, info->strmap) - && info->strtab[j]) - ++j; - ++ndst; + strtab_size += strlen(&info->strtab[src->st_name]) + 1; + ndst++; } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_size += strtab_size; /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); - - /* Append room for core symbols' strings at end of core part. */ - info->stroffs = mod->core_size; - - /* First strtab byte (and first symtab entry) are zeroes. */ - __set_bit(0, info->strmap); - mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2246,22 +2237,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); mod->core_symtab = dst = mod->module_core + info->symoffs; + mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; *dst = *src; + *s++ = 0; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; + dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(info->strmap, - dst[ndst].st_name); - ++ndst; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; } mod->core_num_syms = ndst; - - mod->core_strtab = s = mod->module_core + info->stroffs; - for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) - if (test_bit(i, info->strmap)) - *++s = mod->strtab[i]; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2751,27 +2739,18 @@ static struct module *layout_and_allocate(struct load_info *info) this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, info); - - info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) - * sizeof(long), GFP_KERNEL); - if (!info->strmap) { - err = -ENOMEM; - goto free_percpu; - } layout_symtab(mod, info); /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_strmap; + goto free_percpu; /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; -free_strmap: - kfree(info->strmap); free_percpu: percpu_modfree(mod); out: @@ -2781,7 +2760,6 @@ out: /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { - kfree(info->strmap); percpu_modfree(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); @@ -2911,8 +2889,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto unlink; - /* Get rid of temporary copy and strmap. */ - kfree(info.strmap); + /* Get rid of temporary copy. */ free_copy(&info); /* Done! */ -- cgit v1.2.3 From bd77c04772da38fca510c81f78e51f727123b919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Jan 2012 09:32:14 +1030 Subject: module: struct module_ref should contains long fields module_ref contains two "unsigned int" fields. Thats now too small, since some machines can open more than 2^32 files. Check commit 518de9b39e8 (fs: allow for more than 2^31 files) for reference. We can add an aligned(2 * sizeof(unsigned long)) attribute to force alloc_percpu() allocating module_ref areas in single cache lines. Signed-off-by: Eric Dumazet CC: Rusty Russell CC: Tejun Heo CC: Robin Holt CC: David Miller Signed-off-by: Rusty Russell --- kernel/debug/kdb/kdb_main.c | 2 +- kernel/module.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3cd..e2ae7349437f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); + kdb_printf("%4ld ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/module.c b/kernel/module.c index 4928cffc3dcc..14b8e82e05d4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -725,9 +725,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) } } -unsigned int module_refcount(struct module *mod) +unsigned long module_refcount(struct module *mod) { - unsigned int incs = 0, decs = 0; + unsigned long incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) @@ -853,7 +853,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %u ", module_refcount(mod)); + seq_printf(m, " %lu ", module_refcount(mod)); /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ @@ -903,7 +903,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", module_refcount(mk->mod)); + return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } static struct module_attribute refcnt = { -- cgit v1.2.3 From 5e12416927975aa3c58394cea15db6c3e488a033 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 6 Dec 2011 12:11:31 -0700 Subject: module: replace DEBUGP with pr_debug Use more flexible pr_debug. This allows: echo "module module +p" > /dbg/dynamic_debug/control to turn on debug messages when needed. Signed-off-by: Jim Cromie Signed-off-by: Rusty Russell --- kernel/module.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 14b8e82e05d4..b02d6335f8a6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -62,12 +62,6 @@ #define CREATE_TRACE_POINTS #include -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt , a...) -#endif - #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 #endif @@ -409,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, return fsa.sym; } - DEBUGP("Failed to find symbol %s\n", name); + pr_debug("Failed to find symbol %s\n", name); return NULL; } EXPORT_SYMBOL_GPL(find_symbol); @@ -599,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) list_for_each_entry(use, &b->source_list, source_list) { if (use->source == a) { - DEBUGP("%s uses %s!\n", a->name, b->name); + pr_debug("%s uses %s!\n", a->name, b->name); return 1; } } - DEBUGP("%s does not use %s!\n", a->name, b->name); + pr_debug("%s does not use %s!\n", a->name, b->name); return 0; } @@ -618,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - DEBUGP("Allocating new usage for %s.\n", a->name); + pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { printk(KERN_WARNING "%s: out of memory loading\n", a->name); @@ -662,7 +656,7 @@ static void module_unload_free(struct module *mod) mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; - DEBUGP("%s unusing %s\n", mod->name, i->name); + pr_debug("%s unusing %s\n", mod->name, i->name); module_put(i); list_del(&use->source_list); list_del(&use->target_list); @@ -760,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { - DEBUGP("Looking at refcount...\n"); + pr_debug("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); if (module_refcount(mod) == 0) break; @@ -803,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, if (mod->state != MODULE_STATE_LIVE) { /* FIXME: if (force), slam module count and wake up waiter --RR */ - DEBUGP("%s already dying\n", mod->name); + pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; } @@ -1056,7 +1050,7 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; - DEBUGP("Found checksum %lX vs module %lX\n", + pr_debug("Found checksum %lX vs module %lX\n", maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -1833,7 +1827,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", name); + pr_debug("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1841,7 +1835,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_ABS: /* Don't need to do anything */ - DEBUGP("Absolute symbol: 0x%08lx\n", + pr_debug("Absolute symbol: 0x%08lx\n", (long)sym[i].st_value); break; @@ -1965,7 +1959,7 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) info->sechdrs[i].sh_entsize = ~0UL; - DEBUGP("Core section allocation order:\n"); + pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -1977,7 +1971,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", name); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -1994,7 +1988,7 @@ static void layout_sections(struct module *mod, struct load_info *info) } } - DEBUGP("Init section allocation order:\n"); + pr_debug("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; @@ -2007,7 +2001,7 @@ static void layout_sections(struct module *mod, struct load_info *info) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", sname); + pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ @@ -2195,7 +2189,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, info->index.sym) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); @@ -2216,7 +2210,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } static void add_kallsyms(struct module *mod, const struct load_info *info) @@ -2618,7 +2612,7 @@ static int move_module(struct module *mod, struct load_info *info) mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ - DEBUGP("final section addresses:\n"); + pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; @@ -2636,8 +2630,8 @@ static int move_module(struct module *mod, struct load_info *info) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ shdr->sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", - shdr->sh_addr, info->secstrings + shdr->sh_name); + pr_debug("\t0x%lx %s\n", + (long)shdr->sh_addr, info->secstrings + shdr->sh_name); } return 0; @@ -2798,7 +2792,7 @@ static struct module *load_module(void __user *umod, struct module *mod; long err; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); /* Copy in the blobs from userspace, check they are vaguely sane. */ -- cgit v1.2.3 From 8487bfd954928660a52e91384a9b1f1049217e35 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 6 Dec 2011 12:11:31 -0700 Subject: kernel/params: replace DEBUGP with pr_debug Use more flexible pr_debug. This allows: echo "module params +p" > /dbg/dynamic_debug/control to turn on debug messages when needed. Signed-off-by: Jim Cromie Signed-off-by: Rusty Russell --- kernel/params.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 65aae11eb93f..9240664af110 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -25,12 +25,6 @@ #include #include -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt, a...) -#endif - /* Protects all parameters, and incidentally kmalloced_param list. */ static DEFINE_MUTEX(param_lock); @@ -105,7 +99,7 @@ static int parse_one(char *param, /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; - DEBUGP("They are equal! Calling %p\n", + pr_debug("They are equal! Calling %p\n", params[i].ops->set); mutex_lock(¶m_lock); err = params[i].ops->set(val, ¶ms[i]); @@ -115,11 +109,11 @@ static int parse_one(char *param, } if (handle_unknown) { - DEBUGP("Unknown argument: calling %p\n", handle_unknown); + pr_debug("Unknown argument: calling %p\n", handle_unknown); return handle_unknown(param, val); } - DEBUGP("Unknown argument `%s'\n", param); + pr_debug("Unknown argument `%s'\n", param); return -ENOENT; } @@ -184,7 +178,7 @@ int parse_args(const char *name, { char *param, *val; - DEBUGP("Parsing ARGS: %s\n", args); + pr_debug("Parsing ARGS: %s\n", args); /* Chew leading spaces */ args = skip_spaces(args); -- cgit v1.2.3 From cca3e707301862ca9b9327e6a732463982f8cd1b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 13 Jan 2012 09:32:15 +1030 Subject: modules: sysfs - export: taint, coresize, initsize Recent tools do not want to use /proc to retrieve module information. A few values are currently missing from sysfs to replace the information available in /proc/modules. This adds /sys/module/*/{coresize,initsize,taint} attributes. TAINT_PROPRIETARY_MODULE (P) and TAINT_OOT_MODULE (O) flags are both always shown now, and do no longer exclude each other, also in /proc/modules. Replace the open-coded sysfs attribute initializers with the __ATTR() macro. Add the new attributes to Documentation/ABI. Cc: Lucas De Marchi Signed-off-by: Kay Sievers Signed-off-by: Rusty Russell --- kernel/module.c | 93 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b02d6335f8a6..acf6ed3ebe81 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -842,6 +842,26 @@ out: return ret; } +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; @@ -900,10 +920,8 @@ static ssize_t show_refcnt(struct module_attribute *mattr, return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); } -static struct module_attribute refcnt = { - .attr = { .name = "refcnt", .mode = 0444 }, - .show = show_refcnt, -}; +static struct module_attribute modinfo_refcnt = + __ATTR(refcnt, 0444, show_refcnt, NULL); void module_put(struct module *module) { @@ -963,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, return sprintf(buffer, "%s\n", state); } -static struct module_attribute initstate = { - .attr = { .name = "initstate", .mode = 0444 }, - .show = show_initstate, -}; +static struct module_attribute modinfo_initstate = + __ATTR(initstate, 0444, show_initstate, NULL); static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, @@ -979,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, return count; } -struct module_attribute module_uevent = { - .attr = { .name = "uevent", .mode = 0200 }, - .store = store_uevent, -}; +struct module_attribute module_uevent = + __ATTR(uevent, 0200, NULL, store_uevent); + +static ssize_t show_coresize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->core_size); +} + +static struct module_attribute modinfo_coresize = + __ATTR(coresize, 0444, show_coresize, NULL); + +static ssize_t show_initsize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->init_size); +} + +static struct module_attribute modinfo_initsize = + __ATTR(initsize, 0444, show_initsize, NULL); + +static ssize_t show_taint(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + size_t l; + + l = module_flags_taint(mk->mod, buffer); + buffer[l++] = '\n'; + return l; +} + +static struct module_attribute modinfo_taint = + __ATTR(taint, 0444, show_taint, NULL); static struct module_attribute *modinfo_attrs[] = { + &module_uevent, &modinfo_version, &modinfo_srcversion, - &initstate, - &module_uevent, + &modinfo_initstate, + &modinfo_coresize, + &modinfo_initsize, + &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD - &refcnt, + &modinfo_refcnt, #endif NULL, }; @@ -3236,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[bx++] = 'P'; - else if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[bx++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[bx++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[bx++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - + bx += module_flags_taint(mod, buf + bx); /* Show a - for module-is-being-unloaded */ if (mod->state == MODULE_STATE_GOING) buf[bx++] = '-'; -- cgit v1.2.3 From 69116f279a9eaf4c540934269342d9149538fc79 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: module_param: avoid bool abuse, add bint for special cases. For historical reasons, we allow module_param(bool) to take an int (or an unsigned int). That's going away. A few drivers really want an int: they set it to -1 and a parameter will set it to 0 or 1. This sucks: reading them from sysfs will give 'Y' for both -1 and 1, but if we change it to an int, then the users might be broken (if they did "param" instead of "param=1"). Use a new 'bint' parser for them. (ntfs has a different problem: it needs an int for debug_msgs because it's also exposed via sysctl.) Cc: Steve Glendinning Cc: Jean Delvare Cc: Guenter Roeck Cc: Hoang-Nam Nguyen Cc: Christoph Raisch Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Cc: linux390@de.ibm.com Cc: Anton Altaparmakov Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: lm-sensors@lm-sensors.org Cc: linux-rdma@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: alsa-devel@alsa-project.org Acked-by: Takashi Iwai (For the sound part) Acked-by: Guenter Roeck (For the hwmon driver) Signed-off-by: Rusty Russell --- kernel/params.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 9240664af110..32ee04308285 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -363,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = { }; EXPORT_SYMBOL(param_ops_invbool); +int param_set_bint(const char *val, const struct kernel_param *kp) +{ + struct kernel_param boolkp; + bool v; + int ret; + + /* Match bool exactly, by re-using it. */ + boolkp = *kp; + boolkp.arg = &v; + boolkp.flags |= KPARAM_ISBOOL; + + ret = param_set_bool(val, &boolkp); + if (ret == 0) + *(int *)kp->arg = v; + return ret; +} +EXPORT_SYMBOL(param_set_bint); + +struct kernel_param_ops param_ops_bint = { + .set = param_set_bint, + .get = param_get_int, +}; +EXPORT_SYMBOL(param_ops_bint); + /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, -- cgit v1.2.3 From 29d4d6df107b9d86982dc759f5b1ddfe2c6b29c0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:17 +1030 Subject: printk: fix unnecessary module_param_name. You don't need module_param_name if the name is the same! Cc: Yanmin Zhang Cc: Andrew Morton Signed-off-by: Rusty Russell --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 989e4a52da76..63b3bc31fe32 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str) } early_param("ignore_loglevel", ignore_loglevel_setup); -module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); -- cgit v1.2.3 From 6d6a55ec0877393f467067d44b9a2a8c2e4a82d2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: kernel/async: remove redundant declaration. It's in linux/init.h, and I'm about to change it to a bool. Cc: Arjan van de Ven Signed-off-by: Rusty Russell --- kernel/async.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 80b74b88fefe..bd0c168a3bbe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; -extern int initcall_debug; - /* * MUST be called with the lock held! -- cgit v1.2.3 From 2329abfa344a9a824bc4c71f2415528777265510 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: module_param: make bool parameters really bool (core code) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. Signed-off-by: Rusty Russell --- kernel/irq/internals.h | 2 +- kernel/irq/spurious.c | 2 +- kernel/printk.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a73dd6c7372d..b7952316016a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,7 +15,7 @@ #define istate core_internal_state__do_not_mess_with_it -extern int noirqdebug; +extern bool noirqdebug; /* * Bits used by threaded handlers: diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dc813a948be2..611cd6003c45 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->irqs_unhandled = 0; } -int noirqdebug __read_mostly; +bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { diff --git a/kernel/printk.c b/kernel/printk.c index 63b3bc31fe32..13c0a1143f49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end) } } -static int __read_mostly ignore_loglevel; +static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { @@ -696,9 +696,9 @@ static void zap_locks(void) } #if defined(CONFIG_PRINTK_TIME) -static int printk_time = 1; +static bool printk_time = 1; #else -static int printk_time = 0; +static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); @@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -int console_suspend_enabled = 1; +bool console_suspend_enabled = 1; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) -- cgit v1.2.3 From efeb156e7275c5b6c6e0f96aceb3c6abf98fc392 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 12 Jan 2012 17:17:11 -0800 Subject: kprobes: silence DEBUG_STRICT_USER_COPY_CHECKS=y warning Enabling DEBUG_STRICT_USER_COPY_CHECKS causes the following warning: In file included from arch/x86/include/asm/uaccess.h:573, from kernel/kprobes.c:55: In function 'copy_from_user', inlined from 'write_enabled_file_bool' at kernel/kprobes.c:2191: arch/x86/include/asm/uaccess_64.h:65: warning: call to 'copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct presumably due to buf_size being signed causing GCC to fail to see that buf_size can't become negative. Signed-off-by: Stephen Boyd Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Acked-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823b..95dd7212e610 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) -- cgit v1.2.3 From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:17 -0800 Subject: treewide: remove useless NORET_TYPE macro and uses It's a very old and now unused prototype marking so just delete it. Neaten panic pointer argument style to keep checkpatch quiet. Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- kernel/panic.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -887,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c3..6fd09ed6fd90 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...) { static char buf[1024]; va_list args; -- cgit v1.2.3 From a3dd3323058d281abd584b15ad4c5b65064d7a61 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 12 Jan 2012 17:20:11 -0800 Subject: kexec: remove KMSG_DUMP_KEXEC KMSG_DUMP_KEXEC is useless because we already save kernel messages inside /proc/vmcore, and it is unsafe to allow modules to do other stuffs in a crash dump scenario. [akpm@linux-foundation.org: fix powerpc build] Signed-off-by: WANG Cong Reported-by: Vivek Goyal Acked-by: Vivek Goyal Acked-by: Jarod Wilson Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 090ee10d9604..20ed47ae252f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) if (kexec_crash_image) { struct pt_regs fixed_regs; - kmsg_dump(KMSG_DUMP_KEXEC); - crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); -- cgit v1.2.3 From 6480e5a0923756b500634d9777ec4189492fbbfe Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:14 -0800 Subject: kdump: add missing RAM resource in crash_shrink_memory() When shrinking crashkernel memory using /sys/kernel/kexec_crash_size for the newly added memory no RAM resource is created at the moment. Example: $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : Crash kernel d0000000-ffffffff : System RAM $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss <<-- here is System RAM missing d0000000-ffffffff : System RAM One result of this bug is that the memory chunk can never be set offline using memory hotplug. With this patch I insert a new "System RAM" resource for the released memory. Then the upper example looks like the following: $ echo 0 > /sys/kernel/kexec_crash_size $ cat /proc/iomem 00000000-bfffffff : System RAM 00000000-005b7ac3 : Kernel code 005b7ac4-009743bf : Kernel data 009bb000-00a85c33 : Kernel bss c0000000-cfffffff : System RAM <<-- new rescoure d0000000-ffffffff : System RAM And now I can set chunk c0000000-cfffffff offline. Signed-off-by: Michael Holzheu Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 20ed47ae252f..60bf181b3eae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1146,6 +1147,12 @@ int crash_shrink_memory(unsigned long new_size) goto unlock; } + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); @@ -1154,7 +1161,15 @@ int crash_shrink_memory(unsigned long new_size) if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); crash_unmap_reserved_pages(); unlock: -- cgit v1.2.3 From bec013c40bc89671d8d457944fdf7d2b8e79d651 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:15 -0800 Subject: kdump: crashk_res init check for /sys/kernel/kexec_crash_size Currently it is possible to set the crash_size via the sysfs /sys/kernel/kexec_crash_size even if no crash kernel memory has been defined with the "crashkernel" parameter. In this case "crashk_res" is not initialized and crashk_res.start = crashk_res.end = 0. Unfortunately resource_size(&crashk_res) returns 1 in this case. This breaks the s390 implementation of crash_(un)map_reserved_pages(). To fix the problem the correct "old_size" is now calculated in crash_shrink_memory(). "old_size is set to "0" if crashk_res is not initialized. With this change crash_shrink_memory() will do nothing, when "crashk_res" is not initialized. It will return "0" for "echo 0 > /sys/kernel/kexec_crash_size" and -EINVAL for "echo [not zero] > /sys/kernel/kexec_crash_size". In addition to that this patch also simplifies the "ret = -EINVAL" vs. "ret = 0" logic as suggested by Simon Horman. Signed-off-by: Michael Holzheu Reviewed-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Cc: Vivek Goyal Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bf181b3eae..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; + unsigned long old_size; struct resource *ram_res; mutex_lock(&kexec_mutex); @@ -1139,11 +1140,9 @@ int crash_shrink_memory(unsigned long new_size) } start = crashk_res.start; end = crashk_res.end; - - if (new_size >= end - start + 1) { - ret = -EINVAL; - if (new_size == end - start + 1) - ret = 0; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } -- cgit v1.2.3 From 93e13a360ba331915220f82f6e9543df961ffa1f Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Thu, 12 Jan 2012 17:20:18 -0800 Subject: kdump: fix crash_kexec()/smp_send_stop() race in panic() When two CPUs call panic at the same time there is a possible race condition that can stop kdump. The first CPU calls crash_kexec() and the second CPU calls smp_send_stop() in panic() before crash_kexec() finished on the first CPU. So the second CPU stops the first CPU and therefore kdump fails: 1st CPU: panic()->crash_kexec()->mutex_trylock(&kexec_mutex)-> do kdump 2nd CPU: panic()->crash_kexec()->kexec_mutex already held by 1st CPU ->smp_send_stop()-> stop 1st CPU (stop kdump) This patch fixes the problem by introducing a spinlock in panic that allows only one CPU to process crash_kexec() and the subsequent panic code. All other CPUs call the weak function panic_smp_self_stop() that stops the CPU itself. This function can be overloaded by architecture code. For example "tile" can use their lower-power "nap" instruction for that. Signed-off-by: Michael Holzheu Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6fd09ed6fd90..5dce5404eeef 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state) long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ + while (1) + cpu_relax(); +} + /** * panic - halt the system * @fmt: The text string to print @@ -59,6 +68,7 @@ EXPORT_SYMBOL(panic_blink); */ void panic(const char *fmt, ...) { + static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; @@ -68,8 +78,14 @@ void panic(const char *fmt, ...) * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... + * + * Only one CPU is allowed to execute the panic code from here. For + * multiple parallel invocations of panic, all other CPUs either + * stop themself or will wait until they are stopped by the 1st CPU + * with smp_send_stop(). */ - preempt_disable(); + if (!spin_trylock(&panic_lock)) + panic_smp_self_stop(); console_verbose(); bust_spinlocks(1); -- cgit v1.2.3 From b8f566b04d3cddd192cfd2418ae6d54ac6353792 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 12 Jan 2012 17:20:27 -0800 Subject: sysctl: add the kernel.ns_last_pid control The sysctl works on the current task's pid namespace, getting and setting its last_pid field. Writing is allowed for CAP_SYS_ADMIN-capable tasks thus making it possible to create a task with desired pid value. This ability is required badly for the checkpoint/restore in userspace. This approach suits all the parties for now. Signed-off-by: Pavel Emelyanov Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 4 +++- kernel/pid_namespace.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) } /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it). * We want the winner to have the "later" value, because if the * "earlier" value prevails, then a pid may get reused immediately. * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Writing directly to ns' last_pid field is OK, since this field + * is volatile in a living namespace anyway and a code writing to + * it should synchronize its usage with external means. + */ + + tmp.data = ¤t->nsproxy->pid_ns->last_pid; + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { + { + .procname = "ns_last_pid", + .maxlen = sizeof(int), + .mode = 0666, /* permissions are checked in the handler */ + .proc_handler = pid_ns_ctl_handler, + }, + { } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + register_sysctl_paths(kern_path, pid_ns_ctl_table); return 0; } -- cgit v1.2.3 From 6e6f0a1f0fa6bba1493c296eb30d1e176e1f8530 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:30 -0800 Subject: panic: don't print redundant backtraces on oops When an oops causes a panic and panic prints another backtrace it's pretty common to have the original oops data be scrolled away on a 80x50 screen. The second backtrace is quite redundant and not needed anyways. So don't print the panic backtrace when oops_in_progress is true. [akpm@linux-foundation.org: add comment] Signed-off-by: Andi Kleen Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 5dce5404eeef..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -94,7 +94,11 @@ void panic(const char *fmt, ...) va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); #ifdef CONFIG_DEBUG_BUGVERBOSE - dump_stack(); + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!oops_in_progress) + dump_stack(); #endif /* -- cgit v1.2.3 From 028ee4be34a09a6d48bdf30ab991ae933a7bc036 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:55 -0800 Subject: c/r: prctl: add PR_SET_MM codes to set up mm_struct entries When we restore a task we need to set up text, data and data heap sizes from userspace to the values a task had at checkpoint time. This patch adds auxilary prctl codes for that. While most of them have a statistical nature (their values are involved into calculation of /proc//statm output) the start_brk and brk values are used to compute an allowed size of program data segment expansion. Which means an arbitrary changes of this values might be dangerous operation. So to restrict access the following requirements applied to prctl calls: - The process has to have CAP_SYS_ADMIN capability granted. - For all opcodes except start_brk/brk members an appropriate VMA area must exist and should fit certain VMA flags, such as: - code segment must be executable but not writable; - data segment must not be executable. start_brk/brk values must not intersect with data segment and must not exceed RLIMIT_DATA resource limit. Still the main guard is CAP_SYS_ADMIN capability check. Note the kernel should be compiled with CONFIG_CHECKPOINT_RESTORE support otherwise these prctl calls will return -EINVAL. [akpm@linux-foundation.org: cache current->mm in a local, saving 200 bytes text] Signed-off-by: Cyrill Gorcunov Reviewed-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f8..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + unsigned long rlim = rlimit(RLIMIT_DATA); + unsigned long vm_req_flags; + unsigned long vm_bad_flags; + struct vm_area_struct *vma; + int error = 0; + struct mm_struct *mm = current->mm; + + if (arg4 | arg5) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (addr >= TASK_SIZE) + return -EINVAL; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + + if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { + /* It must be existing VMA */ + if (!vma || vma->vm_start > addr) + goto out; + } + + error = -EINVAL; + switch (opt) { + case PR_SET_MM_START_CODE: + case PR_SET_MM_END_CODE: + vm_req_flags = VM_READ | VM_EXEC; + vm_bad_flags = VM_WRITE | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_CODE) + mm->start_code = addr; + else + mm->end_code = addr; + break; + + case PR_SET_MM_START_DATA: + case PR_SET_MM_END_DATA: + vm_req_flags = VM_READ | VM_WRITE; + vm_bad_flags = VM_EXEC | VM_MAYSHARE; + + if ((vma->vm_flags & vm_req_flags) != vm_req_flags || + (vma->vm_flags & vm_bad_flags)) + goto out; + + if (opt == PR_SET_MM_START_DATA) + mm->start_data = addr; + else + mm->end_data = addr; + break; + + case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else + vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif + if ((vma->vm_flags & vm_req_flags) != vm_req_flags) + goto out; + + mm->start_stack = addr; + break; + + case PR_SET_MM_START_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (mm->brk - addr) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->start_brk = addr; + break; + + case PR_SET_MM_BRK: + if (addr <= mm->end_data) + goto out; + + if (rlim < RLIM_INFINITY && + (addr - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) + goto out; + + mm->brk = addr; + break; + + default: + error = -EINVAL; + goto out; + } + + error = 0; + +out: + up_read(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, + unsigned long arg4, unsigned long arg5) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else error = PR_MCE_KILL_DEFAULT; break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From dae5cbc2440b1d21a15715d0f1fb20f632dd38ee Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 14 Jan 2012 00:33:03 +0100 Subject: PM: Make sysrq-o be available for CONFIG_PM unset After commit 1eb208aea3179dd2fc0cdeea45ef869d75b4fe70, "PM: Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME)", the files under kernel/power are not built unless CONFIG_PM_SLEEP or CONFIG_PM_RUNTIME is set. In particular, this causes kernel/power/poweroff.c to be omitted, even though it should be compiled, because CONFIG_MAGIC_SYSRQ is set. Fix the problem by causing kernel/power/Makefile to be processed for CONFIG_PM unset too. Reported-and-tested-by: Phil Oester Signed-off-by: Rafael J. Wysocki --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f70396e5a24b..2d9de86b7e76 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o @@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o -obj-$(CONFIG_PM) += power/ -obj-$(CONFIG_FREEZER) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o -- cgit v1.2.3 From ee34a37049114303011e154478c63b977bcff24c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Jan 2012 12:56:23 +0800 Subject: PM / Hibernate: Drop the check of swap space size for compressed image For compressed image, the space required is not known until we finish compressing and writing all pages. This patch drops the check, and if swap space is not enough finally, system can still restore to normal after writing swap fails for compressed images. Signed-off-by: Barry Song Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3739ecced085..8742fd013a94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -773,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) pr_debug("PM: Free swap pages: %u\n", free_swap); - required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? - nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + required = PAGES_FOR_IO + nr_pages; return free_swap > required; } @@ -802,10 +801,12 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out_finish; + if (flags & SF_NOCOMPRESS_MODE) { + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); -- cgit v1.2.3 From 53999bf34d55981328f8ba9def558d3e104d6e36 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Sun, 15 Jan 2012 19:32:55 -0400 Subject: error: implicit declaration of function 'module_flags_taint' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent changes to kernel/module.c caused the following compile error: kernel/module.c: In function ‘show_taint’: kernel/module.c:1024:2: error: implicit declaration of function ‘module_flags_taint’ [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Correct this error by moving the definition of module_flags_taint outside of the #ifdef CONFIG_MODULE_UNLOAD section. Signed-off-by: Kevin Winchester Signed-off-by: Linus Torvalds --- kernel/module.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index acf6ed3ebe81..2c932760fd33 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -842,26 +842,6 @@ out: return ret; } -static size_t module_flags_taint(struct module *mod, char *buf) -{ - size_t l = 0; - - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ - return l; -} - static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; @@ -962,6 +942,26 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ +static size_t module_flags_taint(struct module *mod, char *buf) +{ + size_t l = 0; + + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[l++] = 'P'; + if (mod->taints & (1 << TAINT_OOT_MODULE)) + buf[l++] = 'O'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[l++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[l++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + return l; +} + static ssize_t show_initstate(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { -- cgit v1.2.3 From 951880e634a79884236a575b896abf55c39ae0bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Jan 2012 10:19:41 -0800 Subject: Revert "capabitlies: ns_capable can use the cap helpers rather than lsm call" This reverts commit d2a7009f0bb03fa22ad08dd25472efa0568126b9. J. R. Okajima explains: "After this commit, I am afraid access(2) on NFS may not work correctly. The scenario based upon my guess. - access(2) overrides the credentials. - calls inode_permission() -- ... -- generic_permission() -- ns_capable(). - while the old ns_capable() calls security_capable(current_cred()), the new ns_capable() calls has_ns_capability(current) -- security_capable(__task_cred(t)). current_cred() returns current->cred which is effective (overridden) credentials, but __task_cred(current) returns current->real_cred (the NFSD's credential). And the overridden credentials by access(2) lost." Requested-by: J. R. Okajima Acked-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 0fcf1c14a297..3f1adb6c6470 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (has_ns_capability(current, ns, cap)) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } -- cgit v1.2.3 From 5ef30ee53b187786e64bdc1f8109e39d17f2ce58 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: make filetype matching consistent with other filters Every other filter that matches part of the inodes list collected by audit will match against any of the inodes on that list. The filetype matching however had a strange way of doing things. It allowed userspace to indicated if it should match on the first of the second name collected by the kernel. Name collection ordering seems like a kernel internal and making userspace rules get that right just seems like a bad idea. As it turns out the userspace audit writers had no idea it was doing this and thus never overloaded the value field. The kernel always checked the first name collected which for the tested rules was always correct. This patch just makes the filetype matching like the major, minor, inode, and LSM rules in that it will match against any of the names collected. It also changes the rule validation to reject the old unused rule types. Noone knew it was there. Noone used it. Why keep around the extra code? Signed-off-by: Eric Paris --- kernel/auditfilter.c | 4 ++-- kernel/auditsc.c | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d678..d94dde82c3c8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -385,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -536,7 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; default: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b3..a09c50317059 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -305,21 +305,20 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int which) +static int audit_match_filetype(struct audit_context *ctx, int val) { - unsigned index = which & ~S_IFMT; - umode_t mode = which & S_IFMT; + int index; + umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; + for (index = 0; index < ctx->name_count; index++) { + if ((ctx->names[index].ino != -1) && + ((ctx->names[index].mode & S_IFMT) == mode)) + return 1; + } + return 0; } /* -- cgit v1.2.3 From 5195d8e217a78697152d64fc09a16e063a022465 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: dynamically allocate audit_names when not enough space is in the names array This patch does 2 things. First it reduces the number of audit_names allocated in every audit context from 20 to 5. 5 should be enough for all 'normal' syscalls (rename being the worst). Some syscalls can still touch more the 5 inodes such as mount. When rpc filesystem is mounted it will create inodes and those can exceed 5. To handle that problem this patch will dynamically allocate audit_names if it needs more than 5. This should decrease the typicall memory usage while still supporting all the possible kernel operations. Signed-off-by: Eric Paris --- kernel/auditsc.c | 403 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 215 insertions(+), 188 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a09c50317059..1a92d61ddd27 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -71,8 +71,9 @@ #include "audit.h" /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -101,9 +102,8 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { + struct list_head list; /* audit_context->names_list */ const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -113,6 +113,14 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + bool name_put; /* call __putname() for this name */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit + */ + bool should_free; }; struct audit_aux_data { @@ -174,8 +182,17 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - int name_count; - struct audit_names names[AUDIT_NAMES]; + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -307,17 +324,18 @@ static int audit_match_perm(struct audit_context *ctx, int mask) static int audit_match_filetype(struct audit_context *ctx, int val) { - int index; + struct audit_names *n; umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - for (index = 0; index < ctx->name_count; index++) { - if ((ctx->names[index].ino != -1) && - ((ctx->names[index].mode & S_IFMT) == mode)) + list_for_each_entry(n, &ctx->names_list, list) { + if ((n->ino != -1) && + ((n->mode & S_IFMT) == mode)) return 1; } + return 0; } @@ -456,13 +474,14 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, j, need_sid = 1; + int i, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + struct audit_names *n; int result = 0; switch (f->type) { @@ -525,8 +544,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(MAJOR(name->dev), f->op, f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val)) { ++result; break; } @@ -538,8 +557,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(MINOR(name->dev), f->op, f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MINOR(n->dev), f->op, f->val)) { ++result; break; } @@ -550,8 +569,8 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->ino, f->op, f->val)) { ++result; break; } @@ -606,11 +625,10 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (security_audit_rule_match(n->osid, f->type, + f->op, f->lsm_rule, + ctx)) { ++result; break; } @@ -721,40 +739,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* At syscall exit time, this filter is called if any audit_names[] have been +/* + * Given an audit_name check the inode hash table to see if they match. + * Called holding the rcu read lock to protect the use of audit_inode_hash + */ +static int audit_filter_inode_name(struct task_struct *tsk, + struct audit_names *n, + struct audit_context *ctx) { + int word, bit; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + struct audit_entry *e; + enum audit_state state; + + word = AUDIT_WORD(ctx->major); + bit = AUDIT_BIT(ctx->major); + + if (list_empty(list)) + return 0; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { + ctx->current_state = state; + return 1; + } + } + + return 0; +} + +/* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. + * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - int i; - struct audit_entry *e; - enum audit_state state; + struct audit_names *n; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - if (list_empty(list)) - continue; - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_filter_inode_name(tsk, n, ctx)) + break; } rcu_read_unlock(); } @@ -798,7 +829,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - int i; + struct audit_names *n, *next; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -809,10 +840,9 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) { + list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); + n->name, n->name ?: "(null)"); } dump_stack(); return; @@ -823,9 +853,12 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); + list_for_each_entry_safe(n, next, &context->names_list, list) { + list_del(&n->list); + if (n->name && n->name_put) + __putname(n->name); + if (n->should_free) + kfree(n); } context->name_count = 0; path_put(&context->pwd); @@ -863,6 +896,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); + INIT_LIST_HEAD(&context->names_list); return context; } @@ -1323,6 +1357,68 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static void audit_log_name(struct audit_context *context, struct audit_names *n, + int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", record_num); + + if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, "name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1330,6 +1426,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; + struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1469,66 +1566,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_end(ab); } } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); - } + i = 0; + list_for_each_entry(n, &context->names_list, list) + audit_log_name(context, n, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1820,6 +1861,30 @@ retry: #endif } +static struct audit_names *audit_alloc_name(struct audit_context *context) +{ + struct audit_names *aname; + + if (context->name_count < AUDIT_NAMES) { + aname = &context->preallocated_names[context->name_count]; + memset(aname, 0, sizeof(*aname)); + } else { + aname = kzalloc(sizeof(*aname), GFP_NOFS); + if (!aname) + return NULL; + aname->should_free = true; + } + + aname->ino = (unsigned long)-1; + list_add_tail(&aname->list, &context->names_list); + + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return aname; +} + /** * audit_getname - add a name to the list * @name: name to add @@ -1830,6 +1895,7 @@ retry: void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; + struct audit_names *n; if (IS_ERR(name) || !name) return; @@ -1842,13 +1908,15 @@ void __audit_getname(const char *name) #endif return; } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; + + n = audit_alloc_name(context); + if (!n) + return; + + n->name = name; + n->name_len = AUDIT_NAME_FULL; + n->name_put = true; + if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -1870,12 +1938,13 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { + struct audit_names *n; int i; - for (i = 0; i < context->name_count; i++) + + list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } + n->name, n->name ?: "(null)"); + } #endif __putname(name); } @@ -1896,39 +1965,11 @@ void audit_putname(const char *name) #endif } -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - - static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; - memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); - memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); - name->fcap.fE = 0; - name->fcap_ver = 0; - if (!dentry) return 0; @@ -1968,30 +2009,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { - int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; + struct audit_names *n; if (!context->in_syscall) return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? */ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; + + list_for_each_entry_reverse(n, &context->names_list, list) { + if (n->name && (n->name == name)) + goto out; } + + /* unable to find the name from a previous getname() */ + n = audit_alloc_name(context); + if (!n) + return; +out: handle_path(dentry); - audit_copy_inode(&context->names[idx], dentry, inode); + audit_copy_inode(n, dentry, inode); } /** @@ -2010,11 +2046,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { - int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; + struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2024,9 +2060,7 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2039,9 +2073,7 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2059,34 +2091,29 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - if (audit_inc_name_count(context, parent)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], NULL, parent); + audit_copy_inode(n, NULL, parent); } if (!found_child) { - if (audit_inc_name_count(context, inode)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; + n->name = found_parent; + n->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; + n->name_put = false; } if (inode) - audit_copy_inode(&context->names[idx], NULL, inode); - else - context->names[idx].ino = (unsigned long)-1; + audit_copy_inode(n, NULL, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); -- cgit v1.2.3 From 3035c51e8ac0512686ceb9f2bd1d13bdc6e4fb29 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: drop the meaningless and format breaking word 'user' userspace audit messages look like so: type=USER msg=audit(1271170549.415:24710): user pid=14722 uid=0 auid=500 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 msg='' That third field just says 'user'. That's useless and doesn't follow the key=value pair we are trying to enforce. We already know it came from the user based on the record type. Kill that word. Die. Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2c1d6ab7106e..00efe4758c86 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); -- cgit v1.2.3 From 16c174bd95cb07c9d0ad3fcd8c70f9cea7214c9d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: audit: check current inode and containing object when filtering on major and minor The audit system has the ability to filter on the major and minor number of the device containing the inode being operated upon. Lets say that /dev/sda1 has major,minor 8,1 and that we mount /dev/sda1 on /boot. Now lets say we add a watch with a filter on 8,1. If we proceed to open an inode inside /boot, such as /vboot/vmlinuz, we will match the major,minor filter. Lets instead assume that one were to use a tool like debugfs and were to open /dev/sda1 directly and to modify it's contents. We might hope that this would also be logged, but it isn't. The rules will check the major,minor of the device containing /dev/sda1. In other words the rule would match on the major/minor of the tmpfs mounted at /dev. I believe these rules should trigger on either device. The man page is devoid of useful information about the intended semantics. It only seems logical that if you want to know everything that happened on a major,minor that would include things that happened to the device itself... Signed-off-by: Eric Paris --- kernel/auditsc.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1a92d61ddd27..7c495147c3d9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -540,12 +540,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) - result = audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { + if (name) { + if (audit_comparator(MAJOR(name->dev), f->op, f->val) || + audit_comparator(MAJOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MAJOR(n->dev), f->op, f->val)) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val) || + audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -553,12 +555,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { + if (name) { + if (audit_comparator(MINOR(name->dev), f->op, f->val) || + audit_comparator(MINOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(MINOR(n->dev), f->op, f->val)) { + if (audit_comparator(MINOR(n->dev), f->op, f->val) || + audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } -- cgit v1.2.3 From 85e7bac33b8d5edafc4e219c7dfdb3d48e0b4e31 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:05 -0500 Subject: seccomp: audit abnormal end to a process due to seccomp The audit system likes to collect information about processes that end abnormally (SIGSEGV) as this may me useful intrusion detection information. This patch adds audit support to collect information when seccomp forces a task to exit because of misbehavior in a similar way. Signed-off-by: Eric Paris --- kernel/auditsc.c | 50 +++++++++++++++++++++++++++++--------------------- kernel/seccomp.c | 2 ++ 2 files changed, 31 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7c495147c3d9..e9bcb93800d8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2529,6 +2529,25 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + uid_t auid, uid; + gid_t gid; + unsigned int sessionid; + + auid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + current_uid_gid(&uid, &gid); + + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + audit_log_task_context(ab); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " reason="); + audit_log_string(ab, reason); + audit_log_format(ab, " sig=%ld", signr); +} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2539,10 +2558,6 @@ void __audit_mmap_fd(int fd, int flags) void audit_core_dumps(long signr) { struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current), uid; - gid_t gid; - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2551,24 +2566,17 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - current_uid_gid(&uid, &gid); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; + audit_log_abend(ab, "memory violation", signr); + audit_log_end(ab); +} - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); +void __audit_seccomp(unsigned long syscall) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_format(ab, " syscall=%ld", syscall); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631d..e8d76c5895ea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -6,6 +6,7 @@ * This defines a simple but solid secure-computing mode. */ +#include #include #include #include @@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + audit_seccomp(this_syscall); do_exit(SIGKILL); } -- cgit v1.2.3 From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: Audit: push audit success and retcode into arch ptrace.h The audit system previously expected arches calling to audit_syscall_exit to supply as arguments if the syscall was a success and what the return code was. Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things by converting from negative retcodes to an audit internal magic value stating success or failure. This helper was wrong and could indicate that a valid pointer returned to userspace was a failed syscall. The fix is to fix the layering foolishness. We now pass audit_syscall_exit a struct pt_reg and it in turns calls back into arch code to collect the return value and to determine if the syscall was a success or failure. We also define a generic is_syscall_success() macro which determines success/failure based on if the value is < -MAX_ERRNO. This works for arches like x86 which do not use a separate mechanism to indicate syscall failure. We make both the is_syscall_success() and regs_return_value() static inlines instead of macros. The reason is because the audit function must take a void* for the regs. (uml calls theirs struct uml_pt_regs instead of just struct pt_regs so audit_syscall_exit can't take a struct pt_regs). Since the audit function takes a void* we need to use static inlines to cast it back to the arch correct structure to dereference it. The other major change is that on some arches, like ia64, MIPS and ppc, we change regs_return_value() to give us the negative value on syscall failure. THE only other user of this macro, kretprobe_example.c, won't notice and it makes the value signed consistently for the audit functions across all archs. In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old audit code as the return value. But the ptrace_64.h code defined the macro regs_return_value() as regs[3]. I have no idea which one is correct, but this patch now uses the regs_return_value() function, so it now uses regs[3]. For powerpc we previously used regs->result but now use the regs_return_value() function which uses regs->gprs[3]. regs->gprs[3] is always positive so the regs_return_value(), much like ia64 makes it negative before calling the audit code when appropriate. Signed-off-by: Eric Paris Acked-by: H. Peter Anvin [for x86 portion] Acked-by: Tony Luck [for ia64] Acked-by: Richard Weinberger [for uml] Acked-by: David S. Miller [for sparc] Acked-by: Ralf Baechle [for mips] Acked-by: Benjamin Herrenschmidt [for ppc] --- kernel/auditsc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e9bcb93800d8..3d2853808185 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,6 +70,11 @@ #include "audit.h" +/* flags stating the success for a syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child) /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @pt_regs: syscall registers * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child) * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; + context = audit_get_context(tsk, success, return_code); if (likely(!context)) return; -- cgit v1.2.3 From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: inline audit_syscall_entry to reduce burden on archs Every arch calls: if (unlikely(current->audit_context)) audit_syscall_entry() which requires knowledge about audit (the existance of audit_context) in the arch code. Just do it all in static inline in audit.h so that arch's can remain blissfully ignorant. Signed-off-by: Eric Paris --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3d2853808185..b408100dd6ef 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { -- cgit v1.2.3 From 997f5b6444f4608692ec807fb802fd9767c80e76 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: remove AUDIT_SETUP_CONTEXT as it isn't used Audit contexts have 3 states. Disabled, which doesn't collect anything, build, which collects info but might not emit it, and record, which collects and emits. There is a 4th state, setup, which isn't used. Get rid of it. Signed-off-by: Eric Paris --- kernel/audit.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2c..816766803371 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -36,12 +36,8 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall + * and fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it -- cgit v1.2.3 From 56179a6ec65a56e0279a58e35cb450d38f061b94 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: drop some potentially inadvisable likely notations The audit code makes heavy use of likely() and unlikely() macros, but they don't always make sense. Drop any that seem questionable and let the computer do it's thing. Signed-off-by: Eric Paris --- kernel/auditsc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b408100dd6ef..d7382c2aaa9e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -805,7 +805,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (likely(!context)) + if (!context) return NULL; context->return_valid = return_valid; @@ -928,7 +928,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return 0; if (!(context = audit_alloc_context(state))) { @@ -1599,7 +1599,7 @@ void audit_free(struct task_struct *tsk) struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (likely(!context)) + if (!context) return; /* Check for system calls that do not go through the exit @@ -1640,7 +1640,7 @@ void __audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (unlikely(!context)) + if (!context) return; /* @@ -1697,7 +1697,7 @@ void __audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return; context->serial = 0; @@ -1748,7 +1748,7 @@ void __audit_syscall_exit(int success, long return_code) success = AUDITSC_FAILURE; context = audit_get_context(tsk, success, return_code); - if (likely(!context)) + if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) -- cgit v1.2.3 From 07c49417877f8658a6aa0ad9b4e21e4fd4df11b6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline checks for not needing to collect aux records A number of audit hooks make function calls before they determine that auxilary records do not need to be collected. Do those checks as static inlines since the most common case is going to be that records are not needed and we can skip the function call overhead. Signed-off-by: Eric Paris --- kernel/auditsc.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d7382c2aaa9e..e1062f66b01b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2309,14 +2309,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2337,13 +2334,10 @@ int audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void audit_socketcall(int nargs, unsigned long *args) +void __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return; - context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2369,13 +2363,10 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_sockaddr(int len, void *a) +int __audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return 0; - if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) -- cgit v1.2.3 From a4ff8dba7d8ce5ceb43fb27df66292251cc73bdc Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline audit_free to simplify the look of generic code make the conditional a static inline instead of doing it in generic code. Signed-off-by: Eric Paris --- kernel/auditsc.c | 2 +- kernel/exit.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e1062f66b01b..7aaeb38b262a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1594,7 +1594,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void audit_free(struct task_struct *tsk) +void __audit_free(struct task_struct *tsk) { struct audit_context *context; diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..88dcbbc446f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -964,8 +964,7 @@ NORET_TYPE void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); -- cgit v1.2.3 From 7ff68e53ece8c175d2951bb8a30b3cce8f9c5579 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: reject entry,always rules We deprecated entry,always rules a long time ago. Reject those rules as invalid. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d94dde82c3c8..903caa269b5c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: + if (rule->action == AUDIT_ALWAYS) + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { -- cgit v1.2.3 From 6422e78de6880c66a82af512d9bd0c85eb62e661 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: remove audit_finish_fork as it can't be called Audit entry,always rules are not allowed and are automatically changed in exit,always rules in userspace. The kernel refuses to load such rules. Thus a task in the middle of a syscall (and thus in audit_finish_fork()) can only be in one of two states: AUDIT_BUILD_CONTEXT or AUDIT_DISABLED. Since the current task cannot be in AUDIT_RECORD_CONTEXT we aren't every going to actually use the code in audit_finish_fork() since it will return without doing anything. Thus drop the code. Signed-off-by: Eric Paris --- kernel/auditsc.c | 20 -------------------- kernel/fork.c | 2 -- 2 files changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7aaeb38b262a..4d8920f5ab88 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1707,26 +1707,6 @@ void __audit_syscall_entry(int arch, int major, context->ppid = 0; } -void audit_finish_fork(struct task_struct *child) -{ - struct audit_context *ctx = current->audit_context; - struct audit_context *p = child->audit_context; - if (!p || !ctx) - return; - if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) - return; - p->arch = ctx->arch; - p->major = ctx->major; - memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); - p->ctime = ctx->ctime; - p->dummy = ctx->dummy; - p->in_syscall = ctx->in_syscall; - p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); - p->ppid = current->pid; - p->prio = ctx->prio; - p->current_state = ctx->current_state; -} - /** * audit_syscall_exit - deallocate audit context after a system call * @pt_regs: syscall registers diff --git a/kernel/fork.c b/kernel/fork.c index 443f5125f11e..c1e5c21f48c1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1525,8 +1525,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - audit_finish_fork(p); - /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that -- cgit v1.2.3 From efaffd6e4417860c67576ac760dd6e8bbd15f006 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: allow matching on obj_uid Allow syscall exit filter matching based on the uid of the owner of an inode used in a syscall. aka: auditctl -a always,exit -S open -F obj_uid=0 -F perm=wa Signed-off-by: Eric Paris --- kernel/auditfilter.c | 1 + kernel/auditsc.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 903caa269b5c..13e997423dcd 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -461,6 +461,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: + case AUDIT_OBJ_UID: break; case AUDIT_ARCH: entry->rule.arch_f = f; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4d8920f5ab88..5cf3ecc01517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -586,6 +586,18 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_OBJ_UID: + if (name) { + result = audit_comparator(name->uid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->uid, f->op, f->val)) { + ++result; + break; + } + } + } + break; case AUDIT_WATCH: if (name) result = audit_watch_compare(rule->watch, name->ino, name->dev); -- cgit v1.2.3 From 54d3218b31aee5bc9c859ae60fbde933d922448b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: allow audit matching on inode gid Much like the ability to filter audit on the uid of an inode collected, we should be able to filter on the gid of the inode. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 1 + kernel/auditsc.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 13e997423dcd..f10605c787e6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -462,6 +462,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG2: case AUDIT_ARG3: case AUDIT_OBJ_UID: + case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5cf3ecc01517..87b375fb12ff 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -598,6 +598,18 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_OBJ_GID: + if (name) { + result = audit_comparator(name->gid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->gid, f->op, f->val)) { + ++result; + break; + } + } + } + break; case AUDIT_WATCH: if (name) result = audit_watch_compare(rule->watch, name->ino, name->dev); -- cgit v1.2.3 From 0a300be6d5be8f66cd96609334710c268d0bfdce Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: remove task argument to audit_set_loginuid The function always deals with current. Don't expose an option pretending one can use it for something. You can't. Signed-off-by: Eric Paris --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 87b375fb12ff..9d6dd7d869c0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2163,16 +2163,16 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified + * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) +int audit_set_loginuid(uid_t loginuid) { + struct task_struct *task = current; unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; -- cgit v1.2.3 From 633b45454503489209b0d9a45f9e3cd1b852c614 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: only allow tasks to set their loginuid if it is -1 At the moment we allow tasks to set their loginuid if they have CAP_AUDIT_CONTROL. In reality we want tasks to set the loginuid when they log in and it be impossible to ever reset. We had to make it mutable even after it was once set (with the CAP) because on update and admin might have to restart sshd. Now sshd would get his loginuid and the next user which logged in using ssh would not be able to set his loginuid. Systemd has changed how userspace works and allowed us to make the kernel work the way it should. With systemd users (even admins) are not supposed to restart services directly. The system will restart the service for them. Thus since systemd is going to loginuid==-1, sshd would get -1, and sshd would be allowed to set a new loginuid without special permissions. If an admin in this system were to manually start an sshd he is inserting himself into the system chain of trust and thus, logically, it's his loginuid that should be used! Since we have old systems I make this a Kconfig option. Signed-off-by: Eric Paris --- kernel/auditsc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9d6dd7d869c0..bd084a13c719 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2173,9 +2173,18 @@ static atomic_t session_id = ATOMIC_INIT(0); int audit_set_loginuid(uid_t loginuid) { struct task_struct *task = current; - unsigned int sessionid = atomic_inc_return(&session_id); struct audit_context *context = task->audit_context; + unsigned int sessionid; + +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (task->loginuid != -1) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; -- cgit v1.2.3 From 4043cde8ecf7f7d880eb1133c201a3d392fd68c3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: do not call audit_getname on error Just a code cleanup really. We don't need to make a function call just for it to return on error. This also makes the VFS function even easier to follow and removes a conditional on a hot path. Signed-off-by: Eric Paris --- kernel/auditsc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bd084a13c719..9161e70a4379 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1913,9 +1913,6 @@ void __audit_getname(const char *name) struct audit_context *context = current->audit_context; struct audit_names *n; - if (IS_ERR(name) || !name) - return; - if (!context->in_syscall) { #if AUDIT_DEBUG == 2 printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", -- cgit v1.2.3 From 02d86a568c6d2d335256864451ac8ce781bc5652 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: allow interfield comparison in audit rules We wish to be able to audit when a uid=500 task accesses a file which is uid=0. Or vice versa. This patch introduces a new audit filter type AUDIT_FIELD_COMPARE which takes as an 'enum' which indicates which fields should be compared. At this point we only define the task->uid vs inode->uid, but other comparisons can be added. Signed-off-by: Eric Paris --- kernel/auditfilter.c | 5 ++++- kernel/auditsc.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f10605c787e6..a6c3f1abd206 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -526,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: - err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -543,6 +542,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (f->val & ~S_IFMT) goto exit_free; break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) + goto exit_free; + break; default: goto exit_free; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9161e70a4379..8fb2c8e6d624 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -463,6 +463,32 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_field_compare(struct task_struct *tsk, + const struct cred *cred, + struct audit_field *f, + struct audit_context *ctx, + struct audit_names *name) +{ + struct audit_names *n; + + switch (f->val) { + case AUDIT_COMPARE_UID_TO_OBJ_UID: + if (name) { + return audit_comparator(cred->uid, f->op, name->uid); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(cred->uid, f->op, n->uid)) + return 1; + } + } + break; + default: + WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); + return 0; + } + return 0; +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. @@ -693,8 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; + case AUDIT_FIELD_COMPARE: + result = audit_field_compare(tsk, cred, f, ctx, name); + break; } - if (!result) return 0; } -- cgit v1.2.3 From b34b039324bf081554ee8678f9b8c5d937e5206c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: complex interfield comparison helper Rather than code the same loop over and over implement a helper function which uses some pointer magic to make it generic enough to be used numerous places as we implement more audit interfield comparisons Signed-off-by: Eric Paris --- kernel/auditsc.c | 50 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8fb2c8e6d624..b12cc32fe377 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -463,25 +463,53 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_compare_id(uid_t uid1, + struct audit_names *name, + unsigned long name_offset, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + unsigned long addr; + uid_t uid2; + int rc; + + if (name) { + addr = (unsigned long)name; + addr += name_offset; + + uid2 = *(uid_t *)addr; + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + addr = (unsigned long)n; + addr += name_offset; + + uid2 = *(uid_t *)addr; + + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + } + return 0; +} + static int audit_field_compare(struct task_struct *tsk, const struct cred *cred, struct audit_field *f, struct audit_context *ctx, struct audit_names *name) { - struct audit_names *n; - switch (f->val) { case AUDIT_COMPARE_UID_TO_OBJ_UID: - if (name) { - return audit_comparator(cred->uid, f->op, name->uid); - } else if (ctx) { - list_for_each_entry(n, &ctx->names_list, list) { - if (audit_comparator(cred->uid, f->op, n->uid)) - return 1; - } - } - break; + return audit_compare_id(cred->uid, + name, offsetof(struct audit_names, uid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3 From c9fe685f7a17a0ee8bf3fbe51e40b1c8b8e65896 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: allow interfield comparison between gid and ogid Allow audit rules to compare the gid of the running task to the gid of the inode in question. Signed-off-by: Eric Paris --- kernel/auditsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b12cc32fe377..861c7b9c565a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -474,6 +474,8 @@ static int audit_compare_id(uid_t uid1, uid_t uid2; int rc; + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + if (name) { addr = (unsigned long)name; addr += name_offset; @@ -510,6 +512,10 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->uid, name, offsetof(struct audit_names, uid), f, ctx); + case AUDIT_COMPARE_GID_TO_OBJ_GID: + return audit_compare_id(cred->gid, + name, offsetof(struct audit_names, gid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3 From 4a6633ed08af5ba67790b4d1adcdeb8ceb55677e Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Tue, 13 Dec 2011 16:17:51 -0800 Subject: audit: implement all object interfield comparisons This completes the matrix of interfield comparisons between uid/gid information for the current task and the uid/gid information for inodes. aka I can audit based on differences between the euid of the process and the uid of fs objects. Signed-off-by: Peter Moody Signed-off-by: Eric Paris --- kernel/auditsc.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 861c7b9c565a..b8cee462b99e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -508,6 +508,7 @@ static int audit_field_compare(struct task_struct *tsk, struct audit_names *name) { switch (f->val) { + /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: return audit_compare_id(cred->uid, name, offsetof(struct audit_names, uid), @@ -516,6 +517,34 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->gid, name, offsetof(struct audit_names, gid), f, ctx); + case AUDIT_COMPARE_EUID_TO_OBJ_UID: + return audit_compare_id(cred->euid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_EGID_TO_OBJ_GID: + return audit_compare_id(cred->egid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_AUID_TO_OBJ_UID: + return audit_compare_id(tsk->loginuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SUID_TO_OBJ_UID: + return audit_compare_id(cred->suid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SGID_TO_OBJ_GID: + return audit_compare_id(cred->sgid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_FSUID_TO_OBJ_UID: + return audit_compare_id(cred->fsuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_FSGID_TO_OBJ_GID: + return audit_compare_id(cred->fsgid, + name, offsetof(struct audit_names, gid), + f, ctx); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3 From 10d68360871657204885371cdf2594412675d2f9 Mon Sep 17 00:00:00 2001 From: Peter Moody Date: Wed, 4 Jan 2012 15:24:31 -0500 Subject: audit: comparison on interprocess fields This allows audit to specify rules in which we compare two fields of a process. Such as is the running process uid != to the running process euid? Signed-off-by: Peter Moody Signed-off-by: Eric Paris --- kernel/auditsc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b8cee462b99e..593237e3654d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -545,6 +545,45 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_id(cred->fsgid, name, offsetof(struct audit_names, gid), f, ctx); + /* uid comparisons */ + case AUDIT_COMPARE_UID_TO_AUID: + return audit_comparator(cred->uid, f->op, tsk->loginuid); + case AUDIT_COMPARE_UID_TO_EUID: + return audit_comparator(cred->uid, f->op, cred->euid); + case AUDIT_COMPARE_UID_TO_SUID: + return audit_comparator(cred->uid, f->op, cred->suid); + case AUDIT_COMPARE_UID_TO_FSUID: + return audit_comparator(cred->uid, f->op, cred->fsuid); + /* auid comparisons */ + case AUDIT_COMPARE_AUID_TO_EUID: + return audit_comparator(tsk->loginuid, f->op, cred->euid); + case AUDIT_COMPARE_AUID_TO_SUID: + return audit_comparator(tsk->loginuid, f->op, cred->suid); + case AUDIT_COMPARE_AUID_TO_FSUID: + return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + /* euid comparisons */ + case AUDIT_COMPARE_EUID_TO_SUID: + return audit_comparator(cred->euid, f->op, cred->suid); + case AUDIT_COMPARE_EUID_TO_FSUID: + return audit_comparator(cred->euid, f->op, cred->fsuid); + /* suid comparisons */ + case AUDIT_COMPARE_SUID_TO_FSUID: + return audit_comparator(cred->suid, f->op, cred->fsuid); + /* gid comparisons */ + case AUDIT_COMPARE_GID_TO_EGID: + return audit_comparator(cred->gid, f->op, cred->egid); + case AUDIT_COMPARE_GID_TO_SGID: + return audit_comparator(cred->gid, f->op, cred->sgid); + case AUDIT_COMPARE_GID_TO_FSGID: + return audit_comparator(cred->gid, f->op, cred->fsgid); + /* egid comparisons */ + case AUDIT_COMPARE_EGID_TO_SGID: + return audit_comparator(cred->egid, f->op, cred->sgid); + case AUDIT_COMPARE_EGID_TO_FSGID: + return audit_comparator(cred->egid, f->op, cred->fsgid); + /* sgid comparison */ + case AUDIT_COMPARE_SGID_TO_FSGID: + return audit_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; -- cgit v1.2.3 From 5afb8a3f96573f7ea018abb768f5b6ebe1a6c1a4 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 20 Dec 2011 18:39:41 -0500 Subject: audit: fix signedness bug in audit_log_execve_info() In the loop, a size_t "len" is used to hold the return value of audit_log_single_execve_arg(), which returns -1 on error. In that case the error handling (len <= 0) will be bypassed since "len" is unsigned, and the loop continues with (p += len) being wrapped. Change the type of "len" to signed int to fix the error handling. size_t len; ... for (...) { len = audit_log_single_execve_arg(...); if (len <= 0) break; p += len; } Signed-off-by: Xi Wang Signed-off-by: Eric Paris --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 593237e3654d..86584ecb1039 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1362,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i; - size_t len, len_sent = 0; + int i, len; + size_t len_sent = 0; const char __user *p; char *buf; -- cgit v1.2.3 From c158a35c8a681cf68d36f22f058f9f5466386c71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jan 2012 14:07:10 -0800 Subject: audit: no leading space in audit_log_d_path prefix audit_log_d_path() injects an additional space before the prefix, which serves no purpose and doesn't mix well with other audit_log*() functions that do not sneak extra characters into the log. Signed-off-by: Kees Cook Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- kernel/auditsc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 00efe4758c86..705c25a70bff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, " %s", prefix); + audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 86584ecb1039..caaea6e944f8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1171,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, "exe=", + audit_log_d_path(ab, " exe=", &vma->vm_file->f_path); break; } @@ -1540,7 +1540,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1725,7 +1725,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_d_path(ab, " cwd=", &context->pwd); audit_log_end(ab); } } -- cgit v1.2.3