From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 May 2008 23:00:01 +0200 Subject: Introduce new top level suspend and hibernation callbacks Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning 'extended') representing suspend and hibernation operations for bus types, device classes, device types and device drivers. Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops' objects, if defined, instead of the ->suspend(), ->resume(), ->suspend_late(), and ->resume_early() callbacks (the old callbacks will be considered as legacy and gradually phased out). The main purpose of doing this is to separate suspend (aka S2RAM and standby) callbacks from hibernation callbacks in such a way that the new callbacks won't take arguments and the semantics of each of them will be clearly specified. This has been requested for multiple times by many people, including Linus himself, and the reason is that within the current scheme if ->resume() is called, for example, it's difficult to say why it's been called (ie. is it a resume from RAM or from hibernation or a suspend/hibernation failure etc.?). The second purpose is to make the suspend/hibernation callbacks more flexible so that device drivers can handle more than they can within the current scheme. For example, some drivers may need to prevent new children of the device from being registered before their ->suspend() callbacks are executed or they may want to carry out some operations requiring the availability of some other devices, not directly bound via the parent-child relationship, in order to prepare for the execution of ->suspend(), etc. Ultimately, we'd like to stop using the freezing of tasks for suspend and therefore the drivers' suspend/hibernation code will have to take care of the handling of the user space during suspend/hibernation. That, in turn, would be difficult within the current scheme, without the new ->prepare() and ->complete() callbacks. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- kernel/power/disk.c | 22 +++++++++++++++------- kernel/power/main.c | 6 ++++-- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc652..d416be0efa8a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -193,6 +193,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +225,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); + device_resume(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Resume_console: resume_console(); Close: @@ -300,8 +304,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +334,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -403,6 +409,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +418,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,7 +427,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESTORE); Resume_console: resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524e..d023b6b584e5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESUME); Resume_console: resume_console(); Close: -- cgit v1.2.3 From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Jun 2008 23:24:06 +0200 Subject: Suspend-related patches for 2.6.27 ACPI PM: Add possibility to change suspend sequence There are some systems out there that don't work correctly with our current suspend/hibernation code ordering. Provide a workaround for these systems allowing them to pass 'acpi_sleep=old_ordering' in the kernel command line so that it will use the pre-ACPI 2.0 ("old") suspend code ordering. Unfortunately, this requires us to add a platform hook to the resuming of devices for recovering the platform in case one of the device drivers' .suspend() routines returns error code. Namely, ACPI 1.0 specifies that _PTS should be called before suspending devices, but _WAK still should be called before resuming them in order to undo the changes made by _PTS. However, if there is an error during suspending devices, they are automatically resumed without returning control to the PM core, so the _WAK has to be called from within device_resume() in that cases. The patch also reorders and refactors the ACPI suspend/hibernation code to avoid duplication as far as reasonably possible. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- kernel/power/disk.c | 28 ++++++++++++++++++++++------ kernel/power/main.c | 10 +++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d416be0efa8a..f011e0870b52 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode) hibernation_ops->restore_cleanup(); } +/** + * platform_recover - recover the platform from a failure to suspend + * devices. + */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + /** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control @@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode) suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) - goto Resume_console; + goto Recover_platform; if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) @@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Resume_console: resume_console(); Close: platform_end(platform_mode); return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; } /** @@ -398,8 +412,11 @@ int hibernation_platform_enter(void) suspend_console(); error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } error = hibernation_ops->prepare(); if (error) @@ -428,7 +445,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - Resume_console: resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index d023b6b584e5..3398f4651aa1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state) error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; + goto Recover_platform; } if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; if (suspend_ops->prepare) { error = suspend_ops->prepare(); @@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_ops->finish(); Resume_devices: device_resume(PMSG_RESUME); - Resume_console: resume_console(); Close: if (suspend_ops->end) suspend_ops->end(); return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; } /** -- cgit v1.2.3 From 0775b3dbcb6d17b531b36df520ddab735647f3f7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:08 -0700 Subject: suspend, xen: enable PM_SLEEP for CONFIG_XEN Xen save/restore requires PM_SLEEP to be set without requiring SUSPEND or HIBERNATION. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40e8d25..1436c47cb39b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN default y config SUSPEND -- cgit v1.2.3 From 6717ef1aa750b54ddb9d8854b91707ee21f0ad23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 22:17:01 +0200 Subject: Revert "suspend, xen: enable PM_SLEEP for CONFIG_XEN" This reverts commit 6fbbec428c8e7bb617da2e8a589af2e97bcf3bc4. Rafael doesnt like it - it breaks various assumptions. Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 1436c47cb39b..b45da40e8d25 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION || XEN + depends on SUSPEND || HIBERNATION default y config SUSPEND -- cgit v1.2.3 From 98abed02007b19bbfd68b6d06a5485afc3eeb01b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 19 Mar 2008 19:24:59 -0700 Subject: do_wait reorganization This breaks out the guts of do_wait into three subfunctions. The control flow is less nonobvious without so much goto. do_wait_thread and ptrace_do_wait contain the main work of the outer loop. wait_consider_task contains the main work of the inner loop. Signed-off-by: Roland McGrath --- kernel/exit.c | 215 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 135 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ceb258782835..7453356a961f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1238,7 +1238,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(struct task_struct *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1246,7 +1246,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap, int retval, status, traced; pid_t pid = task_pid_vnr(p); - if (unlikely(noreap)) { + if (!likely(options & WEXITED)) + return 0; + + if (unlikely(options & WNOWAIT)) { uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; @@ -1397,13 +1400,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * released the lock and the system call should return. */ static int wait_task_stopped(struct task_struct *p, - int noreap, struct siginfo __user *infop, + int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { int retval, exit_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; + if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); @@ -1421,7 +1427,7 @@ static int wait_task_stopped(struct task_struct *p, if (!exit_code) goto unlock_sig; - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->exit_code = 0; uid = p->uid; @@ -1442,7 +1448,7 @@ unlock_sig: why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) + if (unlikely(options & WNOWAIT)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); @@ -1476,7 +1482,7 @@ unlock_sig: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(struct task_struct *p, int noreap, +static int wait_task_continued(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1484,6 +1490,9 @@ static int wait_task_continued(struct task_struct *p, int noreap, pid_t pid; uid_t uid; + if (!unlikely(options & WCONTINUED)) + return 0; + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1493,7 +1502,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, spin_unlock_irq(&p->sighand->siglock); return 0; } - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); @@ -1519,89 +1528,137 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } +/* + * Consider @p for a wait by @parent. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int ret = eligible_child(type, pid, options, p); + if (ret <= 0) + return ret; + + if (p->exit_state == EXIT_DEAD) + return 0; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_is_stopped_or_traced(p)) + return wait_task_stopped(p, options, infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + int ret = wait_consider_task(tsk, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + /* + * If we never saw an eligile child, check for children stolen by + * ptrace. We don't leave -ECHILD in *@notask_error if there are any, + * because we will eventually be allowed to wait for them again. + */ + if (!*notask_error) + return 0; + + list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) { + int ret = eligible_child(type, pid, options, p); + if (unlikely(ret < 0)) + return ret; + if (ret) { + *notask_error = 0; + return 0; + } + } + + return 0; +} + static long do_wait(enum pid_type type, struct pid *pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; - int flag, retval; + int retval; add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: - /* If there is nothing that can match our critier just get out */ + /* + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ retval = -ECHILD; if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) goto end; - /* - * We will set this flag if we see any child that might later - * match our criteria, even if we are not able to reap it yet. - */ - flag = retval = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - int ret = eligible_child(type, pid, options, p); - if (!ret) - continue; - - if (unlikely(ret < 0)) { - retval = ret; - } else if (task_is_stopped_or_traced(p)) { - /* - * It's stopped now, so it might later - * continue, exit, or stop again. - */ - flag = 1; - if (!(p->ptrace & PT_PTRACED) && - !(options & WUNTRACED)) - continue; - - retval = wait_task_stopped(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state == EXIT_ZOMBIE && - !delay_group_leader(p)) { - /* - * We don't reap group leaders with subthreads. - */ - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state != EXIT_DEAD) { - /* - * It's running now, so it might later - * exit, stop, or stop and then continue. - */ - flag = 1; - if (!unlikely(options & WCONTINUED)) - continue; - retval = wait_task_continued(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } - if (retval != 0) /* tasklist_lock released */ - goto end; - } - if (!flag) { - list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - flag = eligible_child(type, pid, options, p); - if (!flag) - continue; - if (likely(flag > 0)) - break; - retval = flag; - goto end; - } + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. + */ + retval = tsk_result; + goto end; } + if (options & __WNOTHREAD) break; tsk = next_thread(tsk); @@ -1609,16 +1666,14 @@ repeat: } while (tsk != current); read_unlock(&tasklist_lock); - if (flag) { - if (options & WNOHANG) - goto end; + if (!retval && !(options & WNOHANG)) { retval = -ERESTARTSYS; - if (signal_pending(current)) - goto end; - schedule(); - goto repeat; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } } - retval = -ECHILD; + end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); -- cgit v1.2.3 From f470021adb9190819c03d6d8c5c860a17480aa6d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 24 Mar 2008 18:36:23 -0700 Subject: ptrace children revamp ptrace no longer fiddles with the children/sibling links, and the old ptrace_children list is gone. Now ptrace, whether of one's own children or another's via PTRACE_ATTACH, just uses the new ptraced list instead. There should be no user-visible difference that matters. The only change is the order in which do_wait() sees multiple stopped children and stopped ptrace attachees. Since wait_task_stopped() was changed earlier so it no longer reorders the children list, we already know this won't cause any new problems. Signed-off-by: Roland McGrath --- kernel/exit.c | 226 +++++++++++++++++++++++++++++--------------------------- kernel/fork.c | 6 +- kernel/ptrace.c | 37 ++++++---- 3 files changed, 146 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7453356a961f..1e909826a804 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -71,7 +71,7 @@ static void __unhash_process(struct task_struct *p) __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - remove_parent(p); + list_del_init(&p->sibling); } /* @@ -152,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } +/* + * Do final ptrace-related cleanup of a zombie being reaped. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_release_task(struct task_struct *p) +{ + BUG_ON(!list_empty(&p->ptraced)); + ptrace_unlink(p); + BUG_ON(!list_empty(&p->ptrace_entry)); +} + void release_task(struct task_struct * p) { struct task_struct *leader; @@ -160,8 +172,7 @@ repeat: atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + ptrace_release_task(p); __exit_signal(p); /* @@ -315,9 +326,8 @@ static void reparent_to_kthreadd(void) ptrace_unlink(current); /* Reparent to init */ - remove_parent(current); current->real_parent = current->parent = kthreadd_task; - add_parent(current); + list_move_tail(¤t->sibling, ¤t->real_parent->children); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; @@ -692,37 +702,71 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static void -reparent_thread(struct task_struct *p, struct task_struct *father, int traced) +/* + * Detach all tasks we were using ptrace on. + * Any that need to be release_task'd are put on the @dead list. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_exit(struct task_struct *parent, struct list_head *dead) { - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + struct task_struct *p, *n; - /* Move the child from its dying parent to the new one. */ - if (unlikely(traced)) { - /* Preserve ptrace links if someone else is tracing this child. */ - list_del_init(&p->ptrace_list); - if (ptrace_reparented(p)) - list_add(&p->ptrace_list, &p->real_parent->ptrace_children); - } else { - /* If this child is being traced, then we're the one tracing it - * anyway, so let go of it. + list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + continue; + + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself, add it to the @dead list. We can't call + * release_task() here because we already hold tasklist_lock. + * + * If it's our own child, there is no notification to do. */ - p->ptrace = 0; - remove_parent(p); - p->parent = p->real_parent; - add_parent(p); + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, parent)) + do_notify_parent(p, p->exit_signal); + } - if (task_is_traced(p)) { + if (task_detached(p)) { /* - * If it was at a trace stop, turn it into - * a normal stop since it's no longer being - * traced. + * Mark it as in the process of being reaped. */ - ptrace_untrace(p); + p->exit_state = EXIT_DEAD; + list_add(&p->ptrace_entry, dead); } } +} + +/* + * Finish up exit-time ptrace cleanup. + * + * Called without locks. + */ +static void ptrace_exit_finish(struct task_struct *parent, + struct list_head *dead) +{ + struct task_struct *p, *n; + + BUG_ON(!list_empty(&parent->ptraced)); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + +static void reparent_thread(struct task_struct *p, struct task_struct *father) +{ + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); /* If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -737,7 +781,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) /* If we'd notified the old parent about this child's death, * also notify the new parent. */ - if (!traced && p->exit_state == EXIT_ZOMBIE && + if (!ptrace_reparented(p) && + p->exit_state == EXIT_ZOMBIE && !task_detached(p) && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); @@ -754,12 +799,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) static void forget_original_parent(struct task_struct *father) { struct task_struct *p, *n, *reaper = father; - struct list_head ptrace_dead; - - INIT_LIST_HEAD(&ptrace_dead); + LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); + /* + * First clean up ptrace if we were using it. + */ + ptrace_exit(father, &ptrace_dead); + do { reaper = next_thread(reaper); if (reaper == father) { @@ -768,58 +816,19 @@ static void forget_original_parent(struct task_struct *father) } } while (reaper->flags & PF_EXITING); - /* - * There are only two places where our children can be: - * - * - in our child list - * - in our ptraced child list - * - * Search them and reparent children. - */ list_for_each_entry_safe(p, n, &father->children, sibling) { - int ptrace; - - ptrace = p->ptrace; - - /* if father isn't the real parent, then ptrace must be enabled */ - BUG_ON(father != p->real_parent && !ptrace); - - if (father == p->real_parent) { - /* reparent with a reaper, real father it's us */ - p->real_parent = reaper; - reparent_thread(p, father, 0); - } else { - /* reparent ptraced task to its real parent */ - __ptrace_unlink (p); - if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - } - - /* - * if the ptraced child is a detached zombie we must collect - * it before we exit, or it will remain zombie forever since - * we prevented it from self-reap itself while it was being - * traced by us, to be able to see it in wait4. - */ - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) - list_add(&p->ptrace_list, &ptrace_dead); - } - - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { p->real_parent = reaper; - reparent_thread(p, father, 1); + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(p, father); } write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - BUG_ON(!list_empty(&father->ptrace_children)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) { - list_del_init(&p->ptrace_list); - release_task(p); - } + ptrace_exit_finish(father, &ptrace_dead); } /* @@ -1180,13 +1189,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; } - /* - * Do not consider detached threads that are - * not ptraced: - */ - if (task_detached(p) && !p->ptrace) - return 0; - /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1399,7 +1401,7 @@ static int wait_task_zombie(struct task_struct *p, int options, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, +static int wait_task_stopped(int ptrace, struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1407,7 +1409,7 @@ static int wait_task_stopped(struct task_struct *p, uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; - if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED)) + if (!(options & WUNTRACED)) return 0; exit_code = 0; @@ -1416,7 +1418,7 @@ static int wait_task_stopped(struct task_struct *p, if (unlikely(!task_is_stopped_or_traced(p))) goto unlock_sig; - if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) + if (!ptrace && p->signal->group_stop_count > 0) /* * A group stop is in progress and this is the group leader. * We won't report until all threads have stopped. @@ -1445,7 +1447,7 @@ unlock_sig: */ get_task_struct(p); pid = task_pid_vnr(p); - why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); if (unlikely(options & WNOWAIT)) @@ -1536,7 +1538,7 @@ static int wait_task_continued(struct task_struct *p, int options, * Returns zero if the search for a child should continue; * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD. */ -static int wait_consider_task(struct task_struct *parent, +static int wait_consider_task(struct task_struct *parent, int ptrace, struct task_struct *p, int *notask_error, enum pid_type type, struct pid *pid, int options, struct siginfo __user *infop, @@ -1546,6 +1548,15 @@ static int wait_consider_task(struct task_struct *parent, if (ret <= 0) return ret; + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + if (p->exit_state == EXIT_DEAD) return 0; @@ -1562,7 +1573,8 @@ static int wait_consider_task(struct task_struct *parent, *notask_error = 0; if (task_is_stopped_or_traced(p)) - return wait_task_stopped(p, options, infop, stat_addr, ru); + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); return wait_task_continued(p, options, infop, stat_addr, ru); } @@ -1583,11 +1595,16 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error, struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { - int ret = wait_consider_task(tsk, p, notask_error, - type, pid, options, - infop, stat_addr, ru); - if (ret) - return ret; + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } } return 0; @@ -1601,21 +1618,16 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, struct task_struct *p; /* - * If we never saw an eligile child, check for children stolen by - * ptrace. We don't leave -ECHILD in *@notask_error if there are any, - * because we will eventually be allowed to wait for them again. + * Traditionally we see ptrace'd stopped tasks regardless of options. */ - if (!*notask_error) - return 0; + options |= WUNTRACED; - list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) { - int ret = eligible_child(type, pid, options, p); - if (unlikely(ret < 0)) + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) return ret; - if (ret) { - *notask_error = 0; - return 0; - } } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 4bd2f516401f..adefc1131f27 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_children); - INIT_LIST_HEAD(&p->ptrace_list); + INIT_LIST_HEAD(&p->ptrace_entry); + INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible @@ -1198,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - add_parent(p); + list_add_tail(&p->sibling, &p->real_parent->children); if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e337390fce01..8392a9da6450 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -33,13 +33,9 @@ */ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_list)); - if (child->parent == new_parent) - return; - list_add(&child->ptrace_list, &child->parent->ptrace_children); - remove_parent(child); + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; - add_parent(child); } /* @@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); child->ptrace = 0; - if (ptrace_reparented(child)) { - list_del_init(&child->ptrace_list); - remove_parent(child); - child->parent = child->real_parent; - add_parent(child); - } + child->parent = child->real_parent; + list_del_init(&child->ptrace_entry); if (task_is_traced(child)) ptrace_untrace(child); @@ -492,15 +484,34 @@ int ptrace_traceme(void) /* * Are we already being traced? */ +repeat: task_lock(current); if (!(current->ptrace & PT_PTRACED)) { + /* + * See ptrace_attach() comments about the locking here. + */ + unsigned long flags; + if (!write_trylock_irqsave(&tasklist_lock, flags)) { + task_unlock(current); + do { + cpu_relax(); + } while (!write_can_lock(&tasklist_lock)); + goto repeat; + } + ret = security_ptrace(current->parent, current, PTRACE_MODE_ATTACH); + /* * Set the ptrace bit in the process ptrace flags. + * Then link us on our parent's ptraced list. */ - if (!ret) + if (!ret) { current->ptrace |= PT_PTRACED; + __ptrace_link(current, current->real_parent); + } + + write_unlock_irqrestore(&tasklist_lock, flags); } task_unlock(current); return ret; -- cgit v1.2.3 From 14dd0b81414a58caf0296dbeace016bb0a5d11ab Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Mar 2008 18:41:25 -0700 Subject: do_wait: return security_task_wait() error code in place of -ECHILD This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3 "do_wait: fix security checks". That change reverted the effect of commit 73243284463a761e04d69d22c7516b2be7de096c. The rationale for the original commit still stands. The inconsistent treatment of children hidden by ptrace was an unintended omission in the original change and in no way invalidates its purpose. This makes do_wait return the error returned by security_task_wait() (usually -EACCES) in place of -ECHILD when there are some children the caller would be able to wait for if not for the permission failure. A permission error will give the user a clue to look for security policy problems, rather than for mysterious wait bugs. Signed-off-by: Roland McGrath --- kernel/exit.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1e909826a804..a2af6cac823c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1199,14 +1199,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; err = security_task_wait(p); - if (likely(!err)) - return 1; + if (err) + return err; - if (type != PIDTYPE_PID) - return 0; - /* This child was explicitly requested, abort */ - read_unlock(&tasklist_lock); - return err; + return 1; } static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, @@ -1536,7 +1532,8 @@ static int wait_task_continued(struct task_struct *p, int options, * -ECHILD should be in *@notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; - * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD. + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. */ static int wait_consider_task(struct task_struct *parent, int ptrace, struct task_struct *p, int *notask_error, @@ -1545,9 +1542,21 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, int __user *stat_addr, struct rusage __user *ru) { int ret = eligible_child(type, pid, options, p); - if (ret <= 0) + if (!ret) return ret; + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + if (likely(!ptrace) && unlikely(p->ptrace)) { /* * This child is hidden by ptrace. @@ -1585,7 +1594,8 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, * -ECHILD should be in *@notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then - * *@notask_error is 0 if there were any eligible children, or still -ECHILD. + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. */ static int do_wait_thread(struct task_struct *tsk, int *notask_error, enum pid_type type, struct pid *pid, int options, -- cgit v1.2.3 From 666f164f4fbfa78bd00fb4b74788b42a39842c64 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 8 Apr 2008 23:12:30 -0700 Subject: fix dangling zombie when new parent ignores children This fixes an arcane bug that we think was a regression introduced by commit b2b2cbc4b2a2f389442549399a993a8306420baf. When a parent ignores SIGCHLD (or uses SA_NOCLDWAIT), its children would self-reap but they don't if it's using ptrace on them. When the parent thread later exits and ceases to ptrace a child but leaves other live threads in the parent's thread group, any zombie children are left dangling. The fix makes them self-reap then, as they would have done earlier if ptrace had not been in use. Signed-off-by: Roland McGrath --- kernel/exit.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a2af6cac823c..93d2711b9381 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -702,6 +702,23 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } +/* + * Return nonzero if @parent's children should reap themselves. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static int ignoring_children(struct task_struct *parent) +{ + int ret; + struct sighand_struct *psig = parent->sighand; + unsigned long flags; + spin_lock_irqsave(&psig->siglock, flags); + ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); + spin_unlock_irqrestore(&psig->siglock, flags); + return ret; +} + /* * Detach all tasks we were using ptrace on. * Any that need to be release_task'd are put on the @dead list. @@ -711,6 +728,7 @@ static void exit_mm(struct task_struct * tsk) static void ptrace_exit(struct task_struct *parent, struct list_head *dead) { struct task_struct *p, *n; + int ign = -1; list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { __ptrace_unlink(p); @@ -726,10 +744,18 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead) * release_task() here because we already hold tasklist_lock. * * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. */ if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, parent)) do_notify_parent(p, p->exit_signal); + else { + if (ign < 0) + ign = ignoring_children(parent); + if (ign) + p->exit_signal = -1; + } } if (task_detached(p)) { -- cgit v1.2.3 From c349e0a01c3e0f70913db6a5bb61ab204e0602de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Apr 2008 22:39:31 +0200 Subject: ftrace: do not trace scheduler functions do not trace scheduler functions - it's still a bit fragile and can lock up with: http://redhat.com/~mingo/misc/config-Thu_Jul_17_13_34_52_CEST_2008 Signed-off-by: Ingo Molnar --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a7ed838984b..985ddb7da4d0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,6 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o -- cgit v1.2.3 From 93a0886e2368eafb9df5e2021fb185195cee88b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:43:42 -0700 Subject: x86, xen, power: fix up config dependencies on PM Xen save/restore needs bits of code enabled by PM_SLEEP, and PM_SLEEP depends on PM. So make XEN_SAVE_RESTORE depend on PM and PM_SLEEP depend on XEN_SAVE_RESTORE. Signed-off-by: Jeremy Fitzhardinge Acked-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40e8d25..59dfdf1e1d20 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y config SUSPEND -- cgit v1.2.3