diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-20 10:42:08 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-20 10:42:08 -0800 |
commit | a0fa1dd3cdbccec9597fe53b6177a9aa6e20f2f8 (patch) | |
tree | b249854573815eedf377e554f0ea516f86411841 /kernel | |
parent | 9326657abe1a83ed4b4f396b923ca1217fd50cba (diff) | |
parent | eaad45132c564ce377e6dce05e78e08e456d5315 (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
- Add the initial implementation of SCHED_DEADLINE support: a real-time
scheduling policy where tasks that meet their deadlines and
periodically execute their instances in less than their runtime quota
see real-time scheduling and won't miss any of their deadlines.
Tasks that go over their quota get delayed (Available to privileged
users for now)
- Clean up and fix preempt_enable_no_resched() abuse all around the
tree
- Do sched_clock() performance optimizations on x86 and elsewhere
- Fix and improve auto-NUMA balancing
- Fix and clean up the idle loop
- Apply various cleanups and fixes
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
sched: Fix __sched_setscheduler() nice test
sched: Move SCHED_RESET_ON_FORK into attr::sched_flags
sched: Fix up attr::sched_priority warning
sched: Fix up scheduler syscall LTP fails
sched: Preserve the nice level over sched_setscheduler() and sched_setparam() calls
sched/core: Fix htmldocs warnings
sched/deadline: No need to check p if dl_se is valid
sched/deadline: Remove unused variables
sched/deadline: Fix sparse static warnings
m68k: Fix build warning in mac_via.h
sched, thermal: Clean up preempt_enable_no_resched() abuse
sched, net: Fixup busy_loop_us_clock()
sched, net: Clean up preempt_enable_no_resched() abuse
sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding
sched/preempt, locking: Rework local_bh_{dis,en}able()
sched/clock, x86: Avoid a runtime condition in native_sched_clock()
sched/clock: Fix up clear_sched_clock_stable()
sched/clock, x86: Use a static_key for sched_clock_stable
sched/clock: Remove local_irq_disable() from the clocks
sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu/idle.c | 17 | ||||
-rw-r--r-- | kernel/fork.c | 12 | ||||
-rw-r--r-- | kernel/futex.c | 2 | ||||
-rw-r--r-- | kernel/hrtimer.c | 3 | ||||
-rw-r--r-- | kernel/locking/rtmutex-debug.c | 8 | ||||
-rw-r--r-- | kernel/locking/rtmutex.c | 166 | ||||
-rw-r--r-- | kernel/locking/rtmutex_common.h | 23 | ||||
-rw-r--r-- | kernel/sched/Makefile | 5 | ||||
-rw-r--r-- | kernel/sched/clock.c | 78 | ||||
-rw-r--r-- | kernel/sched/core.c | 822 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 216 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.h | 33 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 1640 | ||||
-rw-r--r-- | kernel/sched/debug.c | 4 | ||||
-rw-r--r-- | kernel/sched/fair.c | 83 | ||||
-rw-r--r-- | kernel/sched/rt.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 146 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 2 | ||||
-rw-r--r-- | kernel/softirq.c | 39 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 2 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 65 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 33 |
24 files changed, 3087 insertions, 323 deletions
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a9a387..277f494c2a9a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -105,14 +105,17 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. - */ - if (tif_need_resched()) - set_preempt_need_resched(); } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } diff --git a/kernel/fork.c b/kernel/fork.c index dfa736c98d17..294189fc7ac8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1087,8 +1087,10 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters); + p->pi_waiters = RB_ROOT; + p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; + p->pi_top_task = NULL; #endif } @@ -1311,7 +1313,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(clone_flags, p); + retval = sched_fork(clone_flags, p); + if (retval) + goto bad_fork_cleanup_policy; retval = perf_event_init_task(p); if (retval) @@ -1403,13 +1407,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->tgid = p->pid; } - p->pdeath_signal = 0; - p->exit_state = 0; - p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; + p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; diff --git a/kernel/futex.c b/kernel/futex.c index 1ddc4498f1e1..44a1261cb9ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2426,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * code while we sleep on uaddr. */ debug_rt_mutex_init_waiter(&rt_waiter); + RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); + RB_CLEAR_NODE(&rt_waiter.tree_entry); rt_waiter.task = NULL; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..09094361dce5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -46,6 +46,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/timer.h> #include <linux/freezer.h> @@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, unsigned long slack; slack = current->timer_slack_ns; - if (rt_task(current)) + if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -24,7 +24,7 @@ #include <linux/kallsyms.h> #include <linux/syscalls.h> #include <linux/interrupt.h> -#include <linux/plist.h> +#include <linux/rbtree.h> #include <linux/fs.h> #include <linux/debug_locks.h> @@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } @@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..2e960a2bab81 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -14,6 +14,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/timer.h> #include "rtmutex_common.h" @@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +static inline int +rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio < right->prio) + return 1; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 1 above, + * then right waiter has a dl_prio() too. + */ + if (dl_prio(left->prio)) + return (left->task->dl.deadline < right->task->dl.deadline); + + return 0; +} + +static void +rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &lock->waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + lock->waiters_leftmost = &waiter->tree_entry; + + rb_link_node(&waiter->tree_entry, parent, link); + rb_insert_color(&waiter->tree_entry, &lock->waiters); +} + +static void +rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->tree_entry)) + return; + + if (lock->waiters_leftmost == &waiter->tree_entry) + lock->waiters_leftmost = rb_next(&waiter->tree_entry); + + rb_erase(&waiter->tree_entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree_entry); +} + +static void +rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + task->pi_waiters_leftmost = &waiter->pi_tree_entry; + + rb_link_node(&waiter->pi_tree_entry, parent, link); + rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); +} + +static void +rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + return; + + if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) + task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); + + rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree_entry); +} + /* - * Calculate task priority from the waiter list priority + * Calculate task priority from the waiter tree priority * - * Return task->normal_prio when the waiter list is empty or when + * Return task->normal_prio when the waiter tree is empty or when * the waiter is not allowed to do priority boosting */ int rt_mutex_getprio(struct task_struct *task) @@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task) if (likely(!task_has_pi_waiters(task))) return task->normal_prio; - return min(task_top_pi_waiter(task)->pi_list_entry.prio, + return min(task_top_pi_waiter(task)->prio, task->normal_prio); } +struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return NULL; + + return task_top_pi_waiter(task)->task; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * @@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); - if (task->prio != prio) + if (task->prio != prio || dl_prio(prio)) rt_mutex_setprio(task, prio); } @@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * When deadlock detection is off then we check, if further * priority adjustment is necessary. */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) + if (!detect_deadlock && waiter->prio == task->prio) goto out_unlock_pi; lock = waiter->lock; @@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); + waiter->prio = task->prio; + rt_mutex_enqueue(lock, waiter); /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, top_waiter); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } else if (top_waiter == waiter) { /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } @@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (task->prio >= rt_mutex_top_waiter(lock)->prio) { if (!waiter || waiter != rt_mutex_top_waiter(lock)) return 0; } @@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* remove the queued waiter. */ if (waiter) { - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); task->pi_blocked_on = NULL; } @@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) { top = rt_mutex_top_waiter(lock); - top->pi_list_entry.prio = top->list_entry.prio; - plist_add(&top->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, top); } raw_spin_unlock_irqrestore(&task->pi_lock, flags); } @@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, task->prio); - plist_node_init(&waiter->pi_list_entry, task->prio); + waiter->prio = task->prio; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_enqueue(lock, waiter); task->pi_blocked_on = waiter; @@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, top_waiter); + rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) @@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * boosted mode and go back to normal after releasing * lock->wait_lock. */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + rt_mutex_dequeue_pi(current, waiter); rt_mutex_set_owner(lock, NULL); @@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock, int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); @@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, waiter); if (rt_mutex_has_waiters(lock)) { struct rt_mutex_waiter *next; next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); + rt_mutex_enqueue_pi(owner, next); } __rt_mutex_adjust_prio(owner); @@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!chain_walk) return; @@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { + if (!waiter || (waiter->prio == task->prio && + !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); + RB_CLEAR_NODE(&waiter.pi_tree_entry); + RB_CLEAR_NODE(&waiter.tree_entry); raw_spin_lock(&lock->wait_lock); @@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); + lock->waiters = RB_ROOT; + lock->waiters_leftmost = NULL; debug_rt_mutex_init(lock, name); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..7431a9c86f35 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @list_entry: pi node to enqueue into the mutex waiters list - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @tree_entry: pi node to enqueue into the mutex waiters tree + * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { - struct plist_node list_entry; - struct plist_node pi_list_entry; + struct rb_node tree_entry; + struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -54,14 +54,15 @@ struct rt_mutex_waiter { struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif + int prio; }; /* - * Various helpers to access the waiters-plist: + * Various helpers to access the waiters-tree: */ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !plist_head_empty(&lock->wait_list); + return !RB_EMPTY_ROOT(&lock->waiters); } static inline struct rt_mutex_waiter * @@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, - list_entry); + w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, + tree_entry); BUG_ON(w->lock != lock); return w; @@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !plist_head_empty(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, - pi_list_entry); + return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, + pi_tree_entry); } /* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b621409cf15..9a95c8c2af2a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..6bd6a6731b21 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -26,9 +26,10 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) * local_clock() -- is cpu_clock() on the current cpu. * + * sched_clock_cpu(i) + * * How: * * The implementation either uses sched_clock() when @@ -50,15 +51,6 @@ * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -66,6 +58,8 @@ #include <linux/percpu.h> #include <linux/ktime.h> #include <linux/sched.h> +#include <linux/static_key.h> +#include <linux/workqueue.h> /* * Scheduler clock - returns current time in nanosec units. @@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +static struct static_key __sched_clock_stable = STATIC_KEY_INIT; + +int sched_clock_stable(void) +{ + if (static_key_false(&__sched_clock_stable)) + return false; + return true; +} + +void set_sched_clock_stable(void) +{ + if (!sched_clock_stable()) + static_key_slow_dec(&__sched_clock_stable); +} + +static void __clear_sched_clock_stable(struct work_struct *work) +{ + /* XXX worry about clock continuity */ + if (sched_clock_stable()) + static_key_slow_inc(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + +void clear_sched_clock_stable(void) +{ + if (keventd_up()) + schedule_work(&sched_clock_work); + else + __clear_sched_clock_stable(&sched_clock_work); +} struct sched_clock_data { u64 tick_raw; @@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd; u64 clock; - WARN_ON_ONCE(!irqs_disabled()); - - if (sched_clock_stable) + if (sched_clock_stable()) return sched_clock(); if (unlikely(!sched_clock_running)) return 0ull; + preempt_disable(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); + preempt_enable(); return clock; } @@ -265,7 +289,7 @@ void sched_clock_tick(void) struct sched_clock_data *scd; u64 now, now_gtod; - if (sched_clock_stable) + if (sched_clock_stable()) return; if (unlikely(!sched_clock_running)) @@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); + if (static_key_false(&__sched_clock_stable)) + return sched_clock_cpu(cpu); - return clock; + return sched_clock(); } /* @@ -335,14 +355,10 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - u64 clock; - unsigned long flags; + if (static_key_false(&__sched_clock_stable)) + return sched_clock_cpu(raw_smp_processor_id()); - local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); - local_irq_restore(flags); - - return clock; + return sched_clock(); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ @@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu) u64 cpu_clock(int cpu) { - return sched_clock_cpu(cpu); + return sched_clock(); } u64 local_clock(void) { - return sched_clock_cpu(0); + return sched_clock(); } #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..36c951b7eef8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -296,8 +296,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; - - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (task_has_dl_policy(p)) + prio = MAX_DL_PRIO-1; + else if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class->switched_from) prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) + } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); } @@ -1499,8 +1499,7 @@ void scheduler_ipi(void) * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ - if (tif_need_resched()) - set_preempt_need_resched(); + preempt_fold_need_resched(); if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) @@ -1717,6 +1716,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif + RB_CLEAR_NODE(&p->dl.rb_node); + hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + p->dl.dl_runtime = p->dl.runtime = 0; + p->dl.dl_deadline = p->dl.deadline = 0; + p->dl.dl_period = 0; + p->dl.flags = 0; + INIT_LIST_HEAD(&p->rt.run_list); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1768,7 +1774,7 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); @@ -1790,7 +1796,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; @@ -1807,8 +1813,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (!rt_prio(p->prio)) + if (dl_prio(p->prio)) { + put_cpu(); + return -EAGAIN; + } else if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } else { p->sched_class = &fair_sched_class; + } if (p->sched_class->task_fork) p->sched_class->task_fork(p); @@ -1834,11 +1846,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif put_cpu(); + return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->rd->dl_bw; } +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; + + if (new_bw == p->dl.dl_bw) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -2003,6 +2128,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (unlikely(prev_state == TASK_DEAD)) { task_numa_free(prev); + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -2296,7 +2424,7 @@ void scheduler_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + trigger_load_balance(rq); #endif rq_last_tick_reset(rq); } @@ -2414,10 +2542,10 @@ static inline void schedule_debug(struct task_struct *prev) { /* * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. + * schedule() atomically, we ignore that path. Otherwise whine + * if we are scheduling when we should not. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) __schedule_bug(prev); rcu_sleep_check(); @@ -2761,11 +2889,11 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; + int oldprio, on_rq, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; - BUG_ON(prio < 0 || prio > MAX_PRIO); + BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); @@ -2788,6 +2916,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); + p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -2797,23 +2926,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->put_prev_task(rq, p); - if (rt_prio(prio)) + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || (p->pi_top_task && + dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + p->dl.dl_boosted = 1; + p->dl.dl_throttled = 0; + enqueue_flag = ENQUEUE_REPLENISH; + } else + p->dl.dl_boosted = 0; + p->sched_class = &dl_sched_class; + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (oldprio < prio) + enqueue_flag = ENQUEUE_HEAD; p->sched_class = &rt_sched_class; - else + } else { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; p->sched_class = &fair_sched_class; + } p->prio = prio; if (running) p->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: __task_rq_unlock(rq); } #endif + void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -2831,9 +2986,9 @@ void set_user_nice(struct task_struct *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -2988,22 +3143,95 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + init_dl_task_timer(dl_se); + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) { + int policy = attr->sched_policy; + + if (policy == -1) /* setparam */ + policy = p->policy; + p->policy = policy; - p->rt_priority = prio; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + + if (dl_prio(p->prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; + set_load_weight(p); } +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution (1us); we + * check sched_runtime only since it is always the smaller one. + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ + return attr && attr->sched_deadline != 0 && + (attr->sched_period == 0 || + (s64)(attr->sched_period - attr->sched_deadline) >= 0) && + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && + attr->sched_runtime >= (2 << (DL_SCALE - 1)); +} + /* * check the target process has a UID that matches the current process's */ @@ -3020,10 +3248,12 @@ static bool check_same_owner(struct task_struct *p) return match; } -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; + int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -3037,31 +3267,40 @@ recheck: reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_FIFO && policy != SCHED_RR && + if (policy != SCHED_DEADLINE && + policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && policy != SCHED_IDLE) return -EINVAL; } + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + return -EINVAL; + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (user && !capable(CAP_SYS_NICE)) { + if (fair_policy(policy)) { + if (attr->sched_nice < TASK_NICE(p) && + !can_nice(p, attr->sched_nice)) + return -EPERM; + } + if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); @@ -3071,8 +3310,8 @@ recheck: return -EPERM; /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) return -EPERM; } @@ -3120,14 +3359,21 @@ recheck: /* * If not changing anything there's no need to proceed further: */ - if (unlikely(policy == p->policy && (!rt_policy(policy) || - param->sched_priority == p->rt_priority))) { + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy)) + goto change; + task_rq_unlock(rq, p, &flags); return 0; } +change: -#ifdef CONFIG_RT_GROUP_SCHED if (user) { +#ifdef CONFIG_RT_GROUP_SCHED /* * Do not allow realtime tasks into groups that have no runtime * assigned. @@ -3138,8 +3384,24 @@ recheck: task_rq_unlock(rq, p, &flags); return -EPERM; } - } #endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, &p->cpus_allowed) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3147,6 +3409,17 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3158,7 +3431,7 @@ recheck: oldprio = p->prio; prev_class = p->sched_class; - __setscheduler(rq, p, policy, param->sched_priority); + __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); @@ -3173,6 +3446,26 @@ recheck: return 0; } +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + /* + * Fixup the legacy SCHED_RESET_ON_FORK hack + */ + if (policy & SCHED_RESET_ON_FORK) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check); +} /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. @@ -3186,10 +3479,16 @@ recheck: int sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, true); + return _sched_setscheduler(p, policy, param, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -3206,7 +3505,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, false); + return _sched_setscheduler(p, policy, param, false); } static int @@ -3231,6 +3530,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return retval; } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, + struct sched_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = SCHED_ATTR_SIZE_VER0; + + if (size < SCHED_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * XXX: do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, -20, 19); + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -3262,6 +3634,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) } /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + */ +SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +{ + struct sched_attr attr; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0) + return -EINVAL; + + if (sched_copy_attr(uattr, &attr)) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setattr(p, &attr); + rcu_read_unlock(); + + return retval; +} + +/** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. * @@ -3316,6 +3715,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; + if (task_has_dl_policy(p)) { + retval = -EINVAL; + goto out_unlock; + } lp.sched_priority = p->rt_priority; rcu_read_unlock(); @@ -3331,6 +3734,96 @@ out_unlock: return retval; } +static int sched_read_attr(struct sched_attr __user *uattr, + struct sched_attr *attr, + unsigned int usize) +{ + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, usize)) + return -EFAULT; + + /* + * If we're handed a smaller struct than we know of, + * ensure all the unknown bits are 0 - i.e. old + * user-space does not get uncomplete information. + */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + goto err_size; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, usize); + if (ret) + return -EFAULT; + +out: + return ret; + +err_size: + ret = -E2BIG; + goto out; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + */ +SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + if (task_has_dl_policy(p)) + __getparam_dl(p, &attr); + else if (task_has_rt_policy(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = TASK_NICE(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; @@ -3375,8 +3868,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_unlock; + cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p)) { + const struct cpumask *span = task_rq(p)->rd->span; + + if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + retval = -EBUSY; + goto out_unlock; + } + } +#endif again: retval = set_cpus_allowed_ptr(p, new_mask); @@ -3653,7 +4164,7 @@ again: } double_rq_lock(rq, p_rq); - while (task_rq(p) != p_rq) { + if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); goto again; } @@ -3742,6 +4253,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -3768,6 +4280,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_RR: ret = 1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -4514,13 +5027,31 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned long flags; + long cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); + set_cpu_active(cpu, false); + + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + struct dl_bw *dl_b = dl_bw_of(cpu); + bool overflow; + int cpus; + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (overflow) + return notifier_from_errno(-EBUSY); + } return NOTIFY_OK; - default: - return NOTIFY_DONE; } + + return NOTIFY_DONE; } static int __init migration_init(void) @@ -4739,6 +5270,8 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); + free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -4790,8 +5323,14 @@ static int init_rootdomain(struct root_domain *rd) goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; + + init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; @@ -4799,6 +5338,8 @@ static int init_rootdomain(struct root_domain *rd) free_rto_mask: free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); free_online: free_cpumask_var(rd->online); free_span: @@ -6150,6 +6691,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); + init_sched_dl_class(); } #else void __init sched_init_smp(void) @@ -6219,13 +6761,15 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ } + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, + global_rt_period(), global_rt_runtime()); + #ifdef CONFIG_SMP init_defrootdomain(); #endif - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6249,6 +6793,7 @@ void __init sched_init(void) rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); + init_dl_rq(&rq->dl, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6320,10 +6865,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ @@ -6397,13 +6938,16 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { const struct sched_class *prev_class = p->sched_class; + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + }; int old_prio = p->prio; int on_rq; on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); + __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); resched_task(rq->curr); @@ -6433,7 +6977,7 @@ void normalize_rt_tasks(void) p->se.statistics.block_start = 0; #endif - if (!rt_task(p)) { + if (!dl_task(p) && !rt_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: @@ -6628,16 +7172,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - #ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. @@ -6811,24 +7345,13 @@ static long sched_group_rt_period(struct task_group *tg) do_div(rt_period_us, NSEC_PER_USEC); return rt_period_us; } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period; int ret = 0; - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -6851,17 +7374,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. - */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; + int i, ret = 0; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -6873,36 +7386,88 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; + return ret; } #endif /* CONFIG_RT_GROUP_SCHED */ -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int sched_dl_global_constraints(void) { - int ret; - static DEFINE_MUTEX(mutex); + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + int cpu, ret = 0; - mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* make sure that internally we keep jiffies */ - /* also, writing zero resets timeslice to default */ - if (!ret && write) { - sched_rr_timeslice = sched_rr_timeslice <= 0 ? - RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock(&dl_b->lock); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock(&dl_b->lock); + + if (ret) + break; } - mutex_unlock(&mutex); + return ret; } +static void sched_dl_do_global(void) +{ + u64 new_bw = -1; + int cpu; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock(&dl_b->lock); + dl_b->bw = new_bw; + raw_spin_unlock(&dl_b->lock); + } +} + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; int old_period, old_runtime; static DEFINE_MUTEX(mutex); + int ret; mutex_lock(&mutex); old_period = sysctl_sched_rt_period; @@ -6911,21 +7476,50 @@ int sched_rt_handler(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } + if (ret) + goto undo; + + ret = sched_dl_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; } mutex_unlock(&mutex); return ret; } +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..045fc74e3f09 --- /dev/null +++ b/kernel/sched/cpudeadline.c @@ -0,0 +1,216 @@ +/* + * kernel/sched/cpudl.c + * + * Global CPU deadline management + * + * Author: Juri Lelli <j.lelli@sssup.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gfp.h> +#include <linux/kernel.h> +#include "cpudeadline.h" + +static inline int parent(int i) +{ + return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ + return (i << 1) + 1; +} + +static inline int right_child(int i) +{ + return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static void cpudl_exchange(struct cpudl *cp, int a, int b) +{ + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a], cp->elements[b]); + swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; + + /* adapted from lib/prio_heap.c */ + while(1) { + l = left_child(idx); + r = right_child(idx); + largest = idx; + + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) + largest = l; + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) + largest = r; + if (largest == idx) + break; + + /* Push idx down the heap one level and bump one up */ + cpudl_exchange(cp, largest, idx); + idx = largest; + } +} + +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ + WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ + return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask) +{ + int best_cpu = -1; + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && cpumask_and(later_mask, cp->free_cpus, + &p->cpus_allowed) && cpumask_and(later_mask, + later_mask, cpu_active_mask)) { + best_cpu = cpumask_any(later_mask); + goto out; + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + best_cpu = cpudl_maximum(cp); + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); + } + +out: + WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + + return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ + int old_idx, new_cpu; + unsigned long flags; + + WARN_ON(cpu > num_present_cpus()); + + raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->cpu_to_idx[cpu]; + if (!is_valid) { + /* remove item */ + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + goto out; + } + new_cpu = cp->elements[cp->size - 1].cpu; + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; + cp->elements[old_idx].cpu = new_cpu; + cp->size--; + cp->cpu_to_idx[new_cpu] = old_idx; + cp->cpu_to_idx[cpu] = IDX_INVALID; + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } + cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + + goto out; + } + + if (old_idx == IDX_INVALID) { + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->cpu_to_idx[cpu] = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); + cpumask_clear_cpu(cpu, cp->free_cpus); + } else { + cpudl_change_key(cp, old_idx, dl); + } + +out: + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + raw_spin_lock_init(&cp->lock); + cp->size = 0; + for (i = 0; i < NR_CPUS; i++) + cp->cpu_to_idx[i] = IDX_INVALID; + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + return -ENOMEM; + cpumask_setall(cp->free_cpus); + + return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ + /* + * nothing to do for the moment + */ +} diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include <linux/sched.h> + +#define IDX_INVALID -1 + +struct array_item { + u64 dl; + int cpu; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + int cpu_to_idx[NR_CPUS]; + struct array_item elements[NR_CPUS]; + cpumask_var_t free_cpus; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..0de248202879 --- /dev/null +++ b/kernel/sched/deadline.c @@ -0,0 +1,1640 @@ +/* + * Deadline Scheduling Class (SCHED_DEADLINE) + * + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). + * + * Tasks that periodically executes their instances for less than their + * runtime won't miss any of their deadlines. + * Tasks that are not periodic or sporadic or that tries to execute more + * than their reserved bandwidth will be slowed down (and may potentially + * miss some of their deadlines), and won't affect any other task. + * + * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, + * Juri Lelli <juri.lelli@gmail.com>, + * Michael Trimarchi <michael@amarulasolutions.com>, + * Fabio Checconi <fchecconi@gmail.com> + */ +#include "sched.h" + +#include <linux/slab.h> + +struct dl_bandwidth def_dl_bandwidth; + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ + return container_of(dl_se, struct task_struct, dl); +} + +static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) +{ + return container_of(dl_rq, struct rq, dl); +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + + return &rq->dl; +} + +static inline int on_dl_rq(struct sched_dl_entity *dl_se) +{ + return !RB_EMPTY_NODE(&dl_se->rb_node); +} + +static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +{ + struct sched_dl_entity *dl_se = &p->dl; + + return dl_rq->rb_leftmost == &dl_se->rb_node; +} + +void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) +{ + raw_spin_lock_init(&dl_b->dl_runtime_lock); + dl_b->dl_period = period; + dl_b->dl_runtime = runtime; +} + +extern unsigned long to_ratio(u64 period, u64 runtime); + +void init_dl_bw(struct dl_bw *dl_b) +{ + raw_spin_lock_init(&dl_b->lock); + raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); + if (global_rt_runtime() == RUNTIME_INF) + dl_b->bw = -1; + else + dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); + raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); + dl_b->total_bw = 0; +} + +void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +{ + dl_rq->rb_root = RB_ROOT; + +#ifdef CONFIG_SMP + /* zero means no -deadline tasks */ + dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; + + dl_rq->dl_nr_migratory = 0; + dl_rq->overloaded = 0; + dl_rq->pushable_dl_tasks_root = RB_ROOT; +#else + init_dl_bw(&dl_rq->dl_bw); +#endif +} + +#ifdef CONFIG_SMP + +static inline int dl_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->dlo_count); +} + +static inline void dl_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); + /* + * Must be visible before the overload count is + * set (as in sched_rt.c). + * + * Matched by the barrier in pull_dl_task(). + */ + smp_wmb(); + atomic_inc(&rq->rd->dlo_count); +} + +static inline void dl_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + atomic_dec(&rq->rd->dlo_count); + cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); +} + +static void update_dl_migration(struct dl_rq *dl_rq) +{ + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (!dl_rq->overloaded) { + dl_set_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 1; + } + } else if (dl_rq->overloaded) { + dl_clear_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 0; + } +} + +static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + dl_rq = &rq_of_dl_rq(dl_rq)->dl; + + dl_rq->dl_nr_total++; + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory++; + + update_dl_migration(dl_rq); +} + +static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + dl_rq = &rq_of_dl_rq(dl_rq)->dl; + + dl_rq->dl_nr_total--; + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory--; + + update_dl_migration(dl_rq); +} + +/* + * The list of pushable -deadline task is not a plist, like in + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. + */ +static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node *parent = NULL; + struct task_struct *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct task_struct, + pushable_dl_tasks); + if (dl_entity_preempt(&p->dl, &entry->dl)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + + rb_link_node(&p->pushable_dl_tasks, parent, link); + rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +} + +static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + + if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) + return; + + if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + struct rb_node *next_node; + + next_node = rb_next(&p->pushable_dl_tasks); + dl_rq->pushable_dl_tasks_leftmost = next_node; + } + + rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +} + +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); +} + +static int push_dl_task(struct rq *rq); + +#else + +static inline +void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline +void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +#endif /* CONFIG_SMP */ + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags); + +/* + * We are being explicitly informed that a new instance is starting, + * and this means that: + * - the absolute deadline of the entity has to be placed at + * current time + relative deadline; + * - the runtime of the entity has to be set to the maximum value. + * + * The capability of specifying such event is useful whenever a -deadline + * entity wants to (try to!) synchronize its behaviour with the scheduler's + * one, and to (try to!) reconcile itself with its own scheduling + * parameters. + */ +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + + /* + * We use the regular wall clock time to set deadlines in the + * future; in fact, we must consider execution overheads (time + * spent on hardirq context, etc.). + */ + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + dl_se->dl_new = 0; +} + +/* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus + * exhausting its runtime. + * + * Here we are interested in making runtime overrun possible, but we do + * not want a entity which is misbehaving to affect the scheduling of all + * other entities. + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) + * is used, in order to confine each entity within its own bandwidth. + * + * This function deals exactly with that, and ensures that when the runtime + * of a entity is replenished, its deadline is also postponed. That ensures + * the overrunning entity can't interfere with other entity in the system and + * can't make them miss their deadlines. Reasons why this kind of overruns + * could happen are, typically, a entity voluntarily trying to overcome its + * runtime, or it just underestimated it during sched_setscheduler_ex(). + */ +static void replenish_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + BUG_ON(pi_se->dl_runtime <= 0); + + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. + */ + if (dl_se->dl_deadline == 0) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } + + /* + * We keep moving the deadline away until we get some + * available runtime for the entity. This ensures correct + * handling of situations where the runtime overrun is + * arbitrary large. + */ + while (dl_se->runtime <= 0) { + dl_se->deadline += pi_se->dl_period; + dl_se->runtime += pi_se->dl_runtime; + } + + /* + * At this point, the deadline really should be "in + * the future" with respect to rq->clock. If it's + * not, we are, for some reason, lagging too much! + * Anyway, after having warn userspace abut that, + * we still try to keep the things running by + * resetting the deadline and the budget of the + * entity. + */ + if (dl_time_before(dl_se->deadline, rq_clock(rq))) { + static bool lag_once = false; + + if (!lag_once) { + lag_once = true; + printk_sched("sched: DL replenish lagged to much\n"); + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * Here we check if --at time t-- an entity (which is probably being + * [re]activated or, in general, enqueued) can use its remaining runtime + * and its current deadline _without_ exceeding the bandwidth it is + * assigned (function returns true if it can't). We are in fact applying + * one of the CBS rules: when a task wakes up, if the residual runtime + * over residual deadline fits within the allocated bandwidth, then we + * can keep the current (absolute) deadline and residual budget without + * disrupting the schedulability of the system. Otherwise, we should + * refill the runtime and set the deadline a period in the future, + * because keeping the current (absolute) deadline of the task would + * result in breaking guarantees promised to other tasks. + * + * This function returns true if: + * + * runtime / (deadline - t) > dl_runtime / dl_period , + * + * IOW we can't recycle current parameters. + * + * Notice that the bandwidth check is done against the period. For + * task with deadline equal to period this is the same of using + * dl_deadline instead of dl_period in the equation above. + */ +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, u64 t) +{ + u64 left, right; + + /* + * left and right are the two sides of the equation above, + * after a bit of shuffling to use multiplications instead + * of divisions. + * + * Note that none of the time values involved in the two + * multiplications are absolute: dl_deadline and dl_runtime + * are the relative deadline and the maximum runtime of each + * instance, runtime is the runtime left for the last instance + * and (deadline - t), since t is rq->clock, is the time left + * to the (absolute) deadline. Even if overflowing the u64 type + * is very unlikely to occur in both cases, here we scale down + * as we want to avoid that risk at all. Scaling down by 10 + * means that we reduce granularity to 1us. We are fine with it, + * since this is only a true/false check and, anyway, thinking + * of anything below microseconds resolution is actually fiction + * (but still we want to give the user that illusion >;). + */ + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + right = ((dl_se->deadline - t) >> DL_SCALE) * + (pi_se->dl_runtime >> DL_SCALE); + + return dl_time_before(right, left); +} + +/* + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. + * + * The policy here is that we update the deadline of the entity only if: + * - the current deadline is in the past, + * - using the remaining runtime with the current deadline would make + * the entity exceed its bandwidth. + */ +static void update_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * The arrival of a new instance needs special treatment, i.e., + * the actual scheduling parameters have to be "renewed". + */ + if (dl_se->dl_new) { + setup_new_dl_entity(dl_se, pi_se); + return; + } + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) || + dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * If the entity depleted all its runtime, and if we want it to sleep + * while waiting for some new execution time to become available, we + * set the bandwidth enforcement timer to the replenishment instant + * and try to activate it. + * + * Notice that it is important for the caller to know if the timer + * actually started or not (i.e., the replenishment instant is in + * the future or in the past). + */ +static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + ktime_t now, act; + ktime_t soft, hard; + unsigned long range; + s64 delta; + + if (boosted) + return 0; + /* + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. + */ + act = ns_to_ktime(dl_se->deadline); + now = hrtimer_cb_get_time(&dl_se->dl_timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); + + /* + * If the expiry time already passed, e.g., because the value + * chosen as the deadline is too small, don't even try to + * start the timer in the past! + */ + if (ktime_us_delta(act, now) < 0) + return 0; + + hrtimer_set_expires(&dl_se->dl_timer, act); + + soft = hrtimer_get_softexpires(&dl_se->dl_timer); + hard = hrtimer_get_expires(&dl_se->dl_timer); + range = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&dl_se->dl_timer, soft, + range, HRTIMER_MODE_ABS, 0); + + return hrtimer_active(&dl_se->dl_timer); +} + +/* + * This is the bandwidth enforcement timer callback. If here, we know + * a task is not on its dl_rq, since the fact that the timer was running + * means the task is throttled and needs a runtime replenishment. + * + * However, what we actually do depends on the fact the task is active, + * (it is on its rq) or has been removed from there by a call to + * dequeue_task_dl(). In the former case we must issue the runtime + * replenishment and add the task back to the dl_rq; in the latter, we just + * do nothing but clearing dl_throttled, so that runtime and deadline + * updating (and the queueing back to dl_rq) will be done by the + * next call to enqueue_task_dl(). + */ +static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + dl_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + raw_spin_lock(&rq->lock); + + /* + * We need to take care of a possible races here. In fact, the + * task might have changed its scheduling policy to something + * different from SCHED_DEADLINE or changed its reservation + * parameters (through sched_setscheduler()). + */ + if (!dl_task(p) || dl_se->dl_new) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + dl_se->dl_throttled = 0; + if (p->on_rq) { + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_task(rq->curr); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif + } +unlock: + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +void init_dl_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->dl_timer; + + if (hrtimer_active(timer)) { + hrtimer_try_to_cancel(timer); + return; + } + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = dl_task_timer; +} + +static +int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +{ + int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); + int rorun = dl_se->runtime <= 0; + + if (!rorun && !dmiss) + return 0; + + /* + * If we are beyond our current deadline and we are still + * executing, then we have already used some of the runtime of + * the next instance. Thus, if we do not account that, we are + * stealing bandwidth from the system at each deadline miss! + */ + if (dmiss) { + dl_se->runtime = rorun ? dl_se->runtime : 0; + dl_se->runtime -= rq_clock(rq) - dl_se->deadline; + } + + return 1; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + u64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, delta_exec); + + sched_rt_avg_update(rq, delta_exec); + + dl_se->runtime -= delta_exec; + if (dl_runtime_exceeded(rq, dl_se)) { + __dequeue_task_dl(rq, curr, 0); + if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) + dl_se->dl_throttled = 1; + else + enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + + if (!is_leftmost(curr, &rq->dl)) + resched_task(curr); + } + + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks + * would be able to exceed the shared quota. + * + * Account to the root rt group for now. + * + * The solution we're working towards is having the RT groups scheduled + * using deadline servers -- however there's a few nasties to figure + * out before that can happen. + */ + if (rt_bandwidth_enabled()) { + struct rt_rq *rt_rq = &rq->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + /* + * We'll let actual RT tasks worry about the overflow here, we + * have our own CBS to keep us inline -- see above. + */ + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } +} + +#ifdef CONFIG_SMP + +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); + +static inline u64 next_deadline(struct rq *rq) +{ + struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); + + if (next && dl_prio(next->prio)) + return next->dl.deadline; + else + return 0; +} + +static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + if (dl_rq->earliest_dl.curr == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + /* + * If the dl_rq had no -deadline tasks, or if the new task + * has shorter deadline than the current one on dl_rq, we + * know that the previous earliest becomes our next earliest, + * as the new task becomes the earliest itself. + */ + dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; + dl_rq->earliest_dl.curr = deadline; + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + } else if (dl_rq->earliest_dl.next == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.next)) { + /* + * On the other hand, if the new -deadline task has a + * a later deadline than the earliest one on dl_rq, but + * it is earlier than the next (if any), we must + * recompute the next-earliest. + */ + dl_rq->earliest_dl.next = next_deadline(rq); + } +} + +static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * Since we may have removed our earliest (and/or next earliest) + * task we must recompute them. + */ + if (!dl_rq->dl_nr_running) { + dl_rq->earliest_dl.curr = 0; + dl_rq->earliest_dl.next = 0; + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + } else { + struct rb_node *leftmost = dl_rq->rb_leftmost; + struct sched_dl_entity *entry; + + entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); + dl_rq->earliest_dl.curr = entry->deadline; + dl_rq->earliest_dl.next = next_deadline(rq); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + } +} + +#else + +static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} +static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} + +#endif /* CONFIG_SMP */ + +static inline +void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + u64 deadline = dl_se->deadline; + + WARN_ON(!dl_prio(prio)); + dl_rq->dl_nr_running++; + + inc_dl_deadline(dl_rq, deadline); + inc_dl_migration(dl_se, dl_rq); +} + +static inline +void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + + WARN_ON(!dl_prio(prio)); + WARN_ON(!dl_rq->dl_nr_running); + dl_rq->dl_nr_running--; + + dec_dl_deadline(dl_rq, dl_se->deadline); + dec_dl_migration(dl_se, dl_rq); +} + +static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node *parent = NULL; + struct sched_dl_entity *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_dl_entity, rb_node); + if (dl_time_before(dl_se->deadline, entry->deadline)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->rb_leftmost = &dl_se->rb_node; + + rb_link_node(&dl_se->rb_node, parent, link); + rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + + inc_dl_tasks(dl_se, dl_rq); +} + +static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + if (RB_EMPTY_NODE(&dl_se->rb_node)) + return; + + if (dl_rq->rb_leftmost == &dl_se->rb_node) { + struct rb_node *next_node; + + next_node = rb_next(&dl_se->rb_node); + dl_rq->rb_leftmost = next_node; + } + + rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + RB_CLEAR_NODE(&dl_se->rb_node); + + dec_dl_tasks(dl_se, dl_rq); +} + +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, int flags) +{ + BUG_ON(on_dl_rq(dl_se)); + + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); + else + update_dl_entity(dl_se, pi_se); + + __enqueue_dl_entity(dl_se); +} + +static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + __dequeue_dl_entity(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + struct sched_dl_entity *pi_se = &p->dl; + + /* + * Use the scheduling parameters of the top pi-waiter + * task if we have one and its (relative) deadline is + * smaller than our one... OTW we keep our runtime and + * deadline. + */ + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + pi_se = &pi_task->dl; + + /* + * If p is throttled, we do nothing. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + */ + if (p->dl.dl_throttled) + return; + + enqueue_dl_entity(&p->dl, pi_se, flags); + + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); + + inc_nr_running(rq); +} + +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + dequeue_dl_entity(&p->dl); + dequeue_pushable_dl_task(rq, p); +} + +static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + __dequeue_task_dl(rq, p, flags); + + dec_nr_running(rq); +} + +/* + * Yield task semantic for -deadline tasks is: + * + * get off from the CPU until our next instance, with + * a new runtime. This is of little use now, since we + * don't have a bandwidth reclaiming mechanism. Anyway, + * bandwidth reclaiming is planned for the future, and + * yield_task_dl will indicate that some spare budget + * is available for other task instances to use it. + */ +static void yield_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * We make the task go to sleep until its current deadline by + * forcing its runtime to zero. This way, update_curr_dl() stops + * it and the bandwidth timer will wake it up and will give it + * new scheduling parameters (thanks to dl_new=1). + */ + if (p->dl.runtime > 0) { + rq->curr->dl.dl_new = 1; + p->dl.runtime = 0; + } + update_curr_dl(rq); +} + +#ifdef CONFIG_SMP + +static int find_later_rq(struct task_struct *task); + +static int +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If we are dealing with a -deadline task, we must + * decide where to wake it up. + * If it has a later deadline and the current task + * on this rq can't move (provided the waking task + * can!) we prefer to send it somewhere else. On the + * other hand, if it has a shorter deadline, we + * try to make it stay here, it might be important. + */ + if (unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + (p->nr_cpus_allowed > 1)) { + int target = find_later_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + +out: + return cpu; +} + +static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) +{ + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + return; + + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ + if (p->nr_cpus_allowed != 1 && + cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + return; + + resched_task(rq->curr); +} + +#endif /* CONFIG_SMP */ + +/* + * Only called when both the current and waking task are -deadline + * tasks. + */ +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags) +{ + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * In the unlikely case current and p have the same deadline + * let us try to decide what's the best thing to do... + */ + if ((p->dl.deadline == rq->curr->dl.deadline) && + !test_tsk_need_resched(rq->curr)) + check_preempt_equal_dl(rq, p); +#endif /* CONFIG_SMP */ +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ + s64 delta = p->dl.dl_runtime - p->dl.runtime; + + if (delta > 10000) + hrtick_start(rq, p->dl.runtime); +} +#endif + +static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, + struct dl_rq *dl_rq) +{ + struct rb_node *left = dl_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_dl_entity, rb_node); +} + +struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct sched_dl_entity *dl_se; + struct task_struct *p; + struct dl_rq *dl_rq; + + dl_rq = &rq->dl; + + if (unlikely(!dl_rq->dl_nr_running)) + return NULL; + + dl_se = pick_next_dl_entity(rq, dl_rq); + BUG_ON(!dl_se); + + p = dl_task_of(dl_se); + p->se.exec_start = rq_clock_task(rq); + + /* Running task will never be pushed. */ + dequeue_pushable_dl_task(rq, p); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); +#endif + +#ifdef CONFIG_SMP + rq->post_schedule = has_pushable_dl_tasks(rq); +#endif /* CONFIG_SMP */ + + return p; +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +{ + update_curr_dl(rq); + + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_dl(rq); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + start_hrtick_dl(rq, p); +#endif +} + +static void task_fork_dl(struct task_struct *p) +{ + /* + * SCHED_DEADLINE tasks cannot fork and this is achieved through + * sched_fork() + */ +} + +static void task_dead_dl(struct task_struct *p) +{ + struct hrtimer *timer = &p->dl.dl_timer; + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + /* + * Since we are TASK_DEAD we won't slip out of the domain! + */ + raw_spin_lock_irq(&dl_b->lock); + dl_b->total_bw -= p->dl.dl_bw; + raw_spin_unlock_irq(&dl_b->lock); + + hrtimer_cancel(timer); +} + +static void set_curr_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define DL_MAX_TRIES 3 + +static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + + return 0; +} + +/* Returns the second earliest -deadline task, NULL otherwise */ +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.rb_leftmost; + struct sched_dl_entity *dl_se; + struct task_struct *p = NULL; + +next_node: + next_node = rb_next(next_node); + if (next_node) { + dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); + p = dl_task_of(dl_se); + + if (pick_dl_task(rq, p, cpu)) + return p; + + goto next_node; + } + + return NULL; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); + +static int find_later_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + int this_cpu = smp_processor_id(); + int best_cpu, cpu = task_cpu(task); + + /* Make sure the mask is initialized first */ + if (unlikely(!later_mask)) + return -1; + + if (task->nr_cpus_allowed == 1) + return -1; + + best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, + task, later_mask); + if (best_cpu == -1) + return -1; + + /* + * If we are here, some target has been found, + * the most suitable of which is cached in best_cpu. + * This is, among the runqueues where the current tasks + * have later deadlines than the task's one, the rq + * with the latest possible one. + * + * Now we check how well this matches with task's + * affinity and system topology. + * + * The last cpu where the task run is our first + * guess, since it is most likely cache-hot there. + */ + if (cpumask_test_cpu(cpu, later_mask)) + return cpu; + /* + * Check if this_cpu is to be skipped (i.e., it is + * not in the mask) or not. + */ + if (!cpumask_test_cpu(this_cpu, later_mask)) + this_cpu = -1; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + + /* + * If possible, preempting this_cpu is + * cheaper than migrating. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + /* + * Last chance: if best_cpu is valid and is + * in the mask, that becomes our choice. + */ + if (best_cpu < nr_cpu_ids && + cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * At this point, all our guesses failed, we just return + * 'something', and let the caller sort the things out. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + + return -1; +} + +/* Locks the rq it finds */ +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *later_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < DL_MAX_TRIES; tries++) { + cpu = find_later_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + later_rq = cpu_rq(cpu); + + /* Retry if something changed. */ + if (double_lock_balance(rq, later_rq)) { + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(later_rq->cpu, + &task->cpus_allowed) || + task_running(rq, task) || !task->on_rq)) { + double_unlock_balance(rq, later_rq); + later_rq = NULL; + break; + } + } + + /* + * If the rq we found has no -deadline task, or + * its earliest one has a later deadline than our + * task, the rq is a good one. + */ + if (!later_rq->dl.dl_nr_running || + dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) + break; + + /* Otherwise we try again. */ + double_unlock_balance(rq, later_rq); + later_rq = NULL; + } + + return later_rq; +} + +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!p->on_rq); + BUG_ON(!dl_task(p)); + + return p; +} + +/* + * See if the non running -deadline tasks on this rq + * can be sent to some other CPU where they can preempt + * and start executing. + */ +static int push_dl_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *later_rq; + + if (!rq->dl.overloaded) + return 0; + + next_task = pick_next_pushable_dl_task(rq); + if (!next_task) + return 0; + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * If next_task preempts rq->curr, and rq->curr + * can move away, it makes sense to just reschedule + * without going further in pushing next_task. + */ + if (dl_task(rq->curr) && + dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + rq->curr->nr_cpus_allowed > 1) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* Will lock the rq it'll find */ + later_rq = find_lock_later_rq(next_task, rq); + if (!later_rq) { + struct task_struct *task; + + /* + * We must check all this again, since + * find_lock_later_rq releases rq->lock and it is + * then possible that next_task has migrated. + */ + task = pick_next_pushable_dl_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * The task is still there. We don't try + * again, some other cpu will pull it when ready. + */ + dequeue_pushable_dl_task(rq, next_task); + goto out; + } + + if (!task) + /* No more tasks */ + goto out; + + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, later_rq->cpu); + activate_task(later_rq, next_task, 0); + + resched_task(later_rq->curr); + + double_unlock_balance(rq, later_rq); + +out: + put_task_struct(next_task); + + return 1; +} + +static void push_dl_tasks(struct rq *rq) +{ + /* Terminates as it moves a -deadline task */ + while (push_dl_task(rq)) + ; +} + +static int pull_dl_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + u64 dmin = LONG_MAX; + + if (likely(!dl_overloaded(this_rq))) + return 0; + + /* + * Match the barrier from dl_set_overloaded; this guarantees that if we + * see overloaded we must also see the dlo_mask bit. + */ + smp_rmb(); + + for_each_cpu(cpu, this_rq->rd->dlo_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * It looks racy, abd it is! However, as in sched_rt.c, + * we are fine with this. + */ + if (this_rq->dl.dl_nr_running && + dl_time_before(this_rq->dl.earliest_dl.curr, + src_rq->dl.earliest_dl.next)) + continue; + + /* Might drop this_rq->lock */ + double_lock_balance(this_rq, src_rq); + + /* + * If there are no more pullable tasks on the + * rq, we're done with it. + */ + if (src_rq->dl.dl_nr_running <= 1) + goto skip; + + p = pick_next_earliest_dl_task(src_rq, this_cpu); + + /* + * We found a task to be pulled if: + * - it preempts our current (if there's one), + * - it will preempt the last one we pulled (if any). + */ + if (p && dl_time_before(p->dl.deadline, dmin) && + (!this_rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + this_rq->dl.earliest_dl.curr))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * Then we pull iff p has actually an earlier + * deadline than the current task of its runqueue. + */ + if (dl_time_before(p->dl.deadline, + src_rq->curr->dl.deadline)) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + + /* Is there any other task even earlier? */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull other tasks here */ + if (dl_task(prev)) + pull_dl_task(rq); +} + +static void post_schedule_dl(struct rq *rq) +{ + push_dl_tasks(rq); +} + +/* + * Since the task is not running and a reschedule is not going to happen + * anytime soon on its runqueue, we try pushing it away now. + */ +static void task_woken_dl(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_dl_tasks(rq) && + p->nr_cpus_allowed > 1 && + dl_task(rq->curr) && + (rq->curr->nr_cpus_allowed < 2 || + dl_entity_preempt(&rq->curr->dl, &p->dl))) { + push_dl_tasks(rq); + } +} + +static void set_cpus_allowed_dl(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq; + int weight; + + BUG_ON(!dl_task(p)); + + /* + * Update only if the task is actually running (i.e., + * it is on the rq AND it is not throttled). + */ + if (!on_dl_rq(&p->dl)) + return; + + weight = cpumask_weight(new_mask); + + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; + + rq = task_rq(p); + + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_dl_task(rq, p); + BUG_ON(!rq->dl.dl_nr_migratory); + rq->dl.dl_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_dl_task(rq, p); + rq->dl.dl_nr_migratory++; + } + + update_dl_migration(&rq->dl); +} + +/* Assumes rq->lock is held */ +static void rq_online_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_set_overload(rq); + + if (rq->dl.dl_nr_running > 0) + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); +} + +/* Assumes rq->lock is held */ +static void rq_offline_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_clear_overload(rq); + + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); +} + +void init_sched_dl_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), + GFP_KERNEL, cpu_to_node(i)); +} + +#endif /* CONFIG_SMP */ + +static void switched_from_dl(struct rq *rq, struct task_struct *p) +{ + if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) + hrtimer_try_to_cancel(&p->dl.dl_timer); + +#ifdef CONFIG_SMP + /* + * Since this might be the only -deadline task on the rq, + * this is the right place to try to pull some other one + * from an overloaded cpu, if any. + */ + if (!rq->dl.dl_nr_running) + pull_dl_task(rq); +#endif +} + +/* + * When switching to -deadline, we may overload the rq, then + * we try to push someone off, if possible. + */ +static void switched_to_dl(struct rq *rq, struct task_struct *p) +{ + int check_resched = 1; + + /* + * If p is throttled, don't consider the possibility + * of preempting rq->curr, the check will be done right + * after its runtime will get replenished. + */ + if (unlikely(p->dl.dl_throttled)) + return; + + if (p->on_rq || rq->curr != p) { +#ifdef CONFIG_SMP + if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + /* Only reschedule if pushing failed */ + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + } +} + +/* + * If the scheduling parameters of a -deadline task changed, + * a push or pull operation might be needed. + */ +static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) +{ + if (p->on_rq || rq->curr == p) { +#ifdef CONFIG_SMP + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + pull_dl_task(rq); + + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this + * runqueue. + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && + rq->curr == p) + resched_task(p); +#else + /* + * Again, we don't know if p has a earlier + * or later deadline, so let's blindly set a + * (maybe not needed) rescheduling point. + */ + resched_task(p); +#endif /* CONFIG_SMP */ + } else + switched_to_dl(rq, p); +} + +const struct sched_class dl_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_dl, + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + + .check_preempt_curr = check_preempt_curr_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .pre_schedule = pre_schedule_dl, + .post_schedule = post_schedule_dl, + .task_woken = task_woken_dl, +#endif + + .set_curr_task = set_curr_task_dl, + .task_tick = task_tick_dl, + .task_fork = task_fork_dl, + .task_dead = task_dead_dl, + + .prio_changed = prio_changed_dl, + .switched_from = switched_from_dl, + .switched_to = switched_to_dl, +}; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); + SEQ_printf(m, " %d", task_node(p)); #endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); @@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) PN(cpu_clk); P(jiffies); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - P(sched_clock_stable); + P(sched_clock_stable()); #endif #undef PN #undef P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e64b0794060e..b24b6cfde9aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -/* - * Once a preferred node is selected the scheduler balancer will prefer moving - * a task to that node for sysctl_numa_balancing_settle_count number of PTE - * scans. This will give the process the chance to accumulate more faults on - * the preferred node but still allow the scheduler to move the task again if - * the nodes CPUs are overloaded. - */ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != -1); @@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; + return p->numa_group->faults[task_faults_idx(nid, 0)] + + p->numa_group->faults[task_faults_idx(nid, 1)]; } /* @@ -1023,7 +1015,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; - int imbalance_pct, idx; + int imbalance_pct; struct task_struct *best_task; long best_imp; @@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = cpu_to_node(task_cpu(p)); + p->numa_preferred_nid = task_node(p); return -EINVAL; } @@ -1278,7 +1270,7 @@ static void numa_migrate_preferred(struct task_struct *p) p->numa_migrate_retry = jiffies + HZ; /* Success if task is already running on preferred CPU */ - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) + if (task_node(p) == p->numa_preferred_nid) return; /* Otherwise, try migrate to a CPU on the preferred node */ @@ -1350,7 +1342,6 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) + int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + do { unsigned long load, avg_load; int local_group; @@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } while (sd) { - int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f continue; } - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - unsigned long nr_running; unsigned long load; int i; @@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += nr_running; + sgs->sum_nr_running += rq->nr_running; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -6521,7 +6509,7 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(int call_cpu) +static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; @@ -6652,10 +6640,10 @@ void update_max_interval(void) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; - struct rq *rq = cpu_rq(cpu); + int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -6752,9 +6740,9 @@ out: * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - struct rq *this_rq = cpu_rq(this_cpu); + int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; @@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) update_idle_cpu_load(rq); raw_spin_unlock_irq(&rq->lock); - rebalance_domains(balance_cpu, CPU_IDLE); + rebalance_domains(rq, CPU_IDLE); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -6800,14 +6788,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_power *sgp; - int nr_busy; + int nr_busy, cpu = rq->cpu; - if (unlikely(idle_cpu(cpu))) + if (unlikely(rq->idle_balance)) return 0; /* @@ -6856,7 +6844,7 @@ need_kick: return 1; } #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif /* @@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); + struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(this_rq, idle); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - nohz_idle_balance(this_cpu, idle); + nohz_idle_balance(this_rq, idle); } -static inline int on_null_domain(int cpu) +static inline int on_null_domain(struct rq *rq) { - return !rcu_dereference_sched(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(rq->sd); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq) { /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) + if (unlikely(on_null_domain(rq))) + return; + + if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) - nohz_balancer_kick(cpu); + if (nohz_kick_needed(rq)) + nohz_balancer_kick(); #endif } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..a2740b775b45 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1738,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && - rt_task(rq->curr) && + (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..c2119fd20f8b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -9,6 +10,7 @@ #include <linux/slab.h> #include "cpupri.h" +#include "cpudeadline.h" #include "cpuacct.h" struct rq; @@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq); #define NICE_0_SHIFT SCHED_LOAD_SHIFT /* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE (10) + +/* * These are the 'tuning knobs' of the scheduler: */ @@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq); */ #define RUNTIME_INF ((u64)~0ULL) +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + static inline int rt_policy(int policy) { - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; } static inline int task_has_rt_policy(struct task_struct *p) @@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -108,6 +144,47 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + * - store the maximum -deadline bandwidth of the system (the group); + * - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + * - dl_total_bw array contains, in the i-eth element, the currently + * allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +extern struct dl_bw *dl_bw_of(int i); + +struct dl_bw { + raw_spinlock_t lock; + u64 bw, total_bw; +}; extern struct mutex sched_domains_mutex; @@ -364,6 +441,42 @@ struct rt_rq { #endif }; +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + unsigned long dl_nr_total; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; +#endif +}; + #ifdef CONFIG_SMP /* @@ -382,6 +495,15 @@ struct root_domain { cpumask_var_t online; /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + + /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ @@ -432,6 +554,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -827,8 +950,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } - - static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; @@ -988,6 +1109,7 @@ static const u32 prio_to_wmult[40] = { #else #define ENQUEUE_WAKING 0 #endif +#define ENQUEUE_REPLENISH 8 #define DEQUEUE_SLEEP 1 @@ -1023,6 +1145,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); @@ -1042,6 +1165,7 @@ struct sched_class { for (class = sched_class_highest; class; class = class->next) extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; @@ -1051,7 +1175,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq, int cpu); +extern void trigger_load_balance(struct rq *rq); extern void idle_balance(int this_cpu, struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); @@ -1068,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); + +extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void init_sched_dl_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); @@ -1077,6 +1204,12 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); + +unsigned long to_ratio(u64 period, u64 runtime); + extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); @@ -1353,6 +1486,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class = { - .next = &rt_sched_class, + .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/softirq.c b/kernel/softirq.c index 9a4500e4c189..8b93b3770f85 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -89,7 +89,7 @@ static void wakeup_softirqd(void) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip, unsigned int cnt) +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -107,33 +107,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) /* * Were softirqs turned off above: */ - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_off(ip); raw_local_irq_restore(flags); if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -#else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) -{ - preempt_count_add(cnt); - barrier(); -} +EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ -void local_bh_disable(void) -{ - __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); -} - -EXPORT_SYMBOL(local_bh_disable); - static void __local_bh_enable(unsigned int cnt) { WARN_ON_ONCE(!irqs_disabled()); - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); preempt_count_sub(cnt); } @@ -151,7 +139,7 @@ void _local_bh_enable(void) EXPORT_SYMBOL(_local_bh_enable); -static inline void _local_bh_enable_ip(unsigned long ip) +void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS @@ -166,7 +154,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) * Keep preemption disabled until we are done with * softirq processing: */ - preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); + preempt_count_sub(cnt - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) { /* @@ -182,18 +170,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) #endif preempt_check_resched(); } - -void local_bh_enable(void) -{ - _local_bh_enable_ip(_RET_IP_); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - _local_bh_enable_ip(ip); -} -EXPORT_SYMBOL(local_bh_enable_ip); +EXPORT_SYMBOL(__local_bh_enable_ip); /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, @@ -264,7 +241,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); cpu = smp_processor_id(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a604726d0b..c8da99f905cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -385,13 +385,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "numa_balancing_settle_count", - .data = &sysctl_numa_balancing_settle_count, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "numa_balancing_migrate_deferred", .data = &sysctl_numa_balancing_migrate_deferred, .maxlen = sizeof(unsigned int), diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ea20f7d1ac2c..c833249ab0fb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -177,7 +177,7 @@ static bool can_stop_full_tick(void) * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ - if (!sched_clock_stable) { + if (!sched_clock_stable()) { trace_tick_stop(0, "unstable sched clock\n"); /* * Don't allow the user to think they can get diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..294b8a271a04 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (unlikely(test_time_stamp(delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable; + local_clock_stable = sched_clock_stable(); #endif WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..6e32635e5e57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -16,6 +16,7 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <trace/events/sched.h> #include "trace.h" @@ -27,6 +28,8 @@ static int wakeup_cpu; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; +static int wakeup_dl; +static int tracing_dl = 0; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr) { wakeup_cpu = -1; wakeup_prio = -1; + tracing_dl = 0; if (wakeup_task) put_task_struct(wakeup_task); @@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if ((wakeup_rt && !rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= current->prio) + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; pc = preempt_count(); @@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) arch_spin_lock(&wakeup_lock); /* check for races. */ - if (!tracer_enabled || p->prio >= wakeup_prio) + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) goto out_locked; /* reset the trace */ @@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. + */ + if (dl_task(p)) + tracing_dl = 1; + else + tracing_dl = 0; + wakeup_task = p; get_task_struct(wakeup_task); @@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); } static int wakeup_rt_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); } +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + wakeup_dl = 1; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; @@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = .use_max_tr = true, }; +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = true, +}; + __init static int init_wakeup_tracer(void) { int ret; @@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + return 0; } core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - /* Make this a RT thread, doesn't need to be too high */ - static const struct sched_param param = { .sched_priority = 5 }; + /* Make this a -deadline thread */ + static const struct sched_attr attr = { + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL + }; struct completion *x = data; - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_setattr(current, &attr); /* Make it know we have a new prio */ complete(x); @@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* - * This is an RT task, do short sleeps to let - * others run. + * This will likely be the system top priority + * task, do short sleeps to let others run. */ msleep(100); } @@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tracing_max_latency; struct task_struct *p; - struct completion isrt; + struct completion is_ready; unsigned long count; int ret; - init_completion(&isrt); + init_completion(&is_ready); - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) while (p->on_rq) { /* - * Sleep to make sure the RT thread is asleep too. + * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, * but we want to make sure this test still works. */ msleep(100); } - init_completion(&isrt); + init_completion(&is_ready); wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&isrt); + wait_for_completion(&is_ready); /* stop the tracing. */ tracing_stop(); |