From 3588a085cd52ef080bf72df772378e1ba6bb292f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 Feb 2008 17:45:13 +0100 Subject: hrtimer: fix hrtimer_init_sleeper() users this patch: commit 37bb6cb4097e29ffee970065b74499cbf10603a3 Author: Peter Zijlstra Date: Fri Jan 25 21:08:32 2008 +0100 hrtimer: unlock hrtimer_wakeup Broke hrtimer_init_sleeper() users. It forgot to fix up the futex caller of this function to detect the failed queueing and messed up the do_nanosleep() caller in that it could leak a TASK_INTERRUPTIBLE state. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/futex.c | 2 ++ kernel/hrtimer.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index db9824de8bf0..0edd314798c0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1252,6 +1252,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, t.timer.expires = *abs_time; hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; /* * the timer could have already expired, in which diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index bd5d6b5060bc..1069998fe25f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1315,6 +1315,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod } while (t->task && !signal_pending(current)); + __set_current_state(TASK_RUNNING); + return t->task == NULL; } -- cgit v1.2.3 From 1001d0a9ee74a468077dfd4da0565174e88de26b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Feb 2008 17:45:13 +0100 Subject: timekeeping: update xtime_cache when time(zone) changes xtime_cache needs to be updated whenever xtime and or wall_to_monotic are changed. Otherwise users of xtime_cache might see a stale (and in the case of timezone changes utterly wrong) value until the next update happens. Fixup the obvious places, which miss this update. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Dhaval Giani Signed-off-by: Ingo Molnar --- kernel/time.c | 1 + kernel/time/timekeeping.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 09d3c45c4da7..4064c0566e77 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -129,6 +129,7 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; + update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 092a2366b5a9..cd5dbc4579c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -47,7 +47,7 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ static struct timespec xtime_cache __attribute__ ((aligned (16))); -static inline void update_xtime_cache(u64 nsec) +void update_xtime_cache(u64 nsec) { xtime_cache = xtime; timespec_add_ns(&xtime_cache, nsec); @@ -145,6 +145,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + update_xtime_cache(0); clock->error = 0; ntp_clear(); @@ -252,8 +253,8 @@ void __init timekeeping_init(void) xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + update_xtime_cache(0); total_sleep_time = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -290,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev) } /* Make sure that we have the correct xtime reference */ timespec_add_ns(&xtime, timekeeping_suspend_nsecs); + update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; -- cgit v1.2.3 From 5df7fa1c62146a0933767d040d400013310dbcc7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Feb 2008 17:45:14 +0100 Subject: tick-sched: add more debug information To allow better diagnosis of tick-sched related, especially NOHZ related problems, we need to know when the last wakeup via an irq happened and when the CPU left the idle state. Add two fields (idle_waketime, idle_exittime) to the tick_sched structure and add them to the timer_list output. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 ++ kernel/time/timer_list.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 63f24b550695..88267f0a8471 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -137,6 +137,7 @@ void tick_nohz_update_jiffies(void) cpu_clear(cpu, nohz_cpu_mask); now = ktime_get(); + ts->idle_waketime = now; local_irq_save(flags); tick_do_update_jiffies64(now); @@ -400,6 +401,7 @@ void tick_nohz_restart_sched_tick(void) * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; + ts->idle_exittime = now; hrtimer_cancel(&ts->sched_timer); ts->sched_timer.expires = ts->idle_tick; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 12c5f4cb6b8c..d3d94c1a0fd2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -166,6 +166,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(idle_calls); P(idle_sleeps); P_ns(idle_entrytime); + P_ns(idle_waketime); + P_ns(idle_exittime); P_ns(idle_sleeptime); P(last_jiffies); P(next_jiffies); -- cgit v1.2.3 From 83e96c604e781098a2f61b8a294919bcf3abfab4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Feb 2008 17:45:14 +0100 Subject: futex: Remove warn on in return fixup path The WARN_ON() in the fixup return path of futex_lock_pi() can trigger with false positives. The following scenario happens: t1 holds the futex and t2 and t3 are blocked on the kernel side rt_mutex. t1 releases the futex (and the rt_mutex) and assigned t2 to be the next owner of the futex. t2 is interrupted and returns w/o acquiring the rt_mutex, before t1 can release the rtmutex. t1 releases the rtmutex and t3 becomes the pending owner of the rtmutex. t2 notices that it is the designated owner (user space variable) and fails to acquire the rt_mutex via trylock, because it is not allowed to steal the rt_mutex from t3. Now it looks at the rt_mutex pending owner (t3) and assigns the futex and the pi_state to it. During the fixup t4 steals the rtmutex from t3. t2 returns from the fixup and the owner of the rt_mutex has changed from t3 to t4. There is no need to do another round of fixups from t2. The important part (t2 is not returning as the user space visible owner) is done. The further fixups are done, before either t3 or t4 return to user space. For the user space it is not relevant which task (t3 or t4) is the real owner, as long as those are both in the kernel, which is guaranteed by the serialization of the hash bucket lock. Both tasks (which ever returns first to userspace - t4 because it locked the rt_mutex or t3 due to a signal) are going through the lock_futex_pi() return path where the ownership is fixed before the return to user space. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/futex.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0edd314798c0..0006d64c448e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1537,9 +1537,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, owner = rt_mutex_owner(&q.pi_state->pi_mutex); res = fixup_pi_state_owner(uaddr, &q, owner); - WARN_ON(rt_mutex_owner(&q.pi_state->pi_mutex) != - owner); - /* propagate -EFAULT, if the fixup failed */ if (res) ret = res; -- cgit v1.2.3 From cd689985cf49f6ff5c8eddc48d98b9d581d9475d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Feb 2008 17:45:14 +0100 Subject: futex: Add bitset conditional wait/wakeup functionality To allow the implementation of optimized rw-locks in user space, glibc needs a possibility to select waiters for wakeup depending on a bitset mask. This requires two new futex OPs: FUTEX_WAIT_BITS and FUTEX_WAKE_BITS These OPs are basically the same as FUTEX_WAIT and FUTEX_WAKE plus an additional argument - a bitset. Further the FUTEX_WAIT_BITS OP is expecting an absolute timeout value instead of the relative one, which is used for the FUTEX_WAIT OP. FUTEX_WAIT_BITS calls into the kernel with a bitset. The bitset is stored in the futex_q structure, which is used to enqueue the waiter into the hashed futex waitqueue. FUTEX_WAKE_BITS also calls into the kernel with a bitset. The wakeup function logically ANDs the bitset with the bitset stored in each waiters futex_q structure. If the result is zero (i.e. none of the set bits in the bitsets is matching), then the waiter is not woken up. If the result is not zero (i.e. one of the set bits in the bitsets is matching), then the waiter is woken. The bitset provided by the caller must be non zero. In case the provided bitset is zero the kernel returns EINVAL. Internaly the new OPs are only extensions to the existing FUTEX_WAIT and FUTEX_WAKE functions. The existing OPs hand a bitset with all bits set into the futex_wait() and futex_wake() functions. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/futex.c | 37 ++++++++++++++++++++++++++++++------- kernel/futex_compat.c | 3 ++- 2 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0006d64c448e..a6baaec44b8f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -109,6 +109,9 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; + + /* Bitset for the optional bitmasked wakeup */ + u32 bitset; }; /* @@ -722,7 +725,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * to this virtual address: */ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake) + int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -730,6 +733,9 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, union futex_key key; int ret; + if (!bitset) + return -EINVAL; + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); @@ -746,6 +752,11 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, ret = -EINVAL; break; } + + /* Check if one of the bits is set in both bitsets */ + if (!(this->bitset & bitset)) + continue; + wake_futex(this); if (++ret >= nr_wake) break; @@ -1156,7 +1167,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, static long futex_wait_restart(struct restart_block *restart); static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, - u32 val, ktime_t *abs_time) + u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1167,7 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, struct hrtimer_sleeper t; int rem = 0; + if (!bitset) + return -EINVAL; + q.pi_state = NULL; + q.bitset = bitset; retry: futex_lock_mm(fshared); @@ -1295,6 +1310,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, restart->futex.uaddr = (u32 *)uaddr; restart->futex.val = val; restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; restart->futex.flags = 0; if (fshared) @@ -1321,7 +1337,8 @@ static long futex_wait_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = ¤t->mm->mmap_sem; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t); + return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + restart->futex.bitset); } @@ -1942,7 +1959,8 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1); + futex_wake(uaddr, &curr->mm->mmap_sem, 1, + FUTEX_BITSET_MATCH_ANY); } return 0; } @@ -2042,10 +2060,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: - ret = futex_wait(uaddr, fshared, val, timeout); + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAIT_BITSET: + ret = futex_wait(uaddr, fshared, val, timeout, val3); break; case FUTEX_WAKE: - ret = futex_wake(uaddr, fshared, val); + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAKE_BITSET: + ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ @@ -2085,7 +2107,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, u32 val2 = 0; int cmd = op & FUTEX_CMD_MASK; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 0a43def6fee7..133d558db452 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -167,7 +167,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int val2 = 0; int cmd = op & FUTEX_CMD_MASK; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) -- cgit v1.2.3