From 86506a99a62400e9f7b7d1344bcc9ea235faf98f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Nov 2013 15:36:14 +0100 Subject: tasks/exit: Remove unused task_is_dead() method task_is_dead() has no users since commit 43e13cc107cf ("cred: remove task_is_dead() from __task_cred() validation"), and nobody except exit.c should rely on ->exit_state (we still have the users which should be changed). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: David Laight Cc: Geert Uytterhoeven Cc: Tejun Heo Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20131113143614.GA10547@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7e35d4b9e14a..cda8d634bc47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -168,7 +168,6 @@ extern char ___assert_task_state[1 - 2*!!( #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_dead(task) ((task)->exit_state != 0) #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ -- cgit v1.2.3 From 1bd53a7efdc988163ec4c25f656df38dbe500632 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 12 Dec 2013 15:23:23 +0800 Subject: sched/numa: Drop sysctl_numa_balancing_settle_count sysctl commit 887c290e (sched/numa: Decide whether to favour task or group weights based on swap candidate relationships) drop the check against sysctl_numa_balancing_settle_count, this patch remove the sysctl. Signed-off-by: Wanpeng Li Acked-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: David Rientjes Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Naoya Horiguchi Link: http://lkml.kernel.org/r/1386833006-6600-1-git-send-email-liwanp@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 41467f8ff8ec..31e0193cb0c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -48,7 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; -extern unsigned int sysctl_numa_balancing_settle_count; #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; -- cgit v1.2.3 From d50dde5a10f305253cbc3855307f608f8a3c5f73 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:36 +0100 Subject: sched: Add new scheduler syscalls to support an extended scheduling parameters ABI Add the syscalls needed for supporting scheduling algorithms with extended scheduling parameters (e.g., SCHED_DEADLINE). In general, it makes possible to specify a periodic/sporadic task, that executes for a given amount of runtime at each instance, and is scheduled according to the urgency of their own timing constraints, i.e.: - a (maximum/typical) instance execution time, - a minimum interval between consecutive instances, - a time constraint by which each instance must be completed. Thus, both the data structure that holds the scheduling parameters of the tasks and the system calls dealing with it must be extended. Unfortunately, modifying the existing struct sched_param would break the ABI and result in potentially serious compatibility issues with legacy binaries. For these reasons, this patch: - defines the new struct sched_attr, containing all the fields that are necessary for specifying a task in the computational model described above; - defines and implements the new scheduling related syscalls that manipulate it, i.e., sched_setattr() and sched_getattr(). Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a proof of concept and for developing and testing purposes. Making them available on other architectures is straightforward. Since no "user" for these new parameters is introduced in this patch, the implementation of the new system calls is just identical to their already existing counterpart. Future patches that implement scheduling policies able to exploit the new data structure must also take care of modifying the sched_*attr() calls accordingly with their own purposes. Signed-off-by: Dario Faggioli [ Rewrote to use sched_attr. ] Signed-off-by: Juri Lelli [ Removed sched_setscheduler2() for now. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 6 +++++ 2 files changed, 68 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3a1e9857b393..86025b6c6387 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -56,6 +56,66 @@ struct sched_param { #include +#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + +/* + * Extended scheduling parameters data structure. + * + * This is needed because the original struct sched_param can not be + * altered without introducing ABI issues with legacy applications + * (e.g., in sched_getparam()). + * + * However, the possibility of specifying more than just a priority for + * the tasks may be useful for a wide variety of application fields, e.g., + * multimedia, streaming, automation and control, and many others. + * + * This variant (sched_attr) is meant at describing a so-called + * sporadic time-constrained task. In such model a task is specified by: + * - the activation period or minimum instance inter-arrival time; + * - the maximum (or average, depending on the actual scheduling + * discipline) computation time of all instances, a.k.a. runtime; + * - the deadline (relative to the actual activation time) of each + * instance. + * Very briefly, a periodic (sporadic) task asks for the execution of + * some specific computation --which is typically called an instance-- + * (at most) every period. Moreover, each instance typically lasts no more + * than the runtime and must be completed by time instant t equal to + * the instance activation time + the deadline. + * + * This is reflected by the actual fields of the sched_attr structure: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_flags for customizing the scheduler behaviour + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * @sched_deadline representative of the task's deadline + * @sched_runtime representative of the task's runtime + * @sched_period representative of the task's period + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their + * timing constraints. + */ +struct sched_attr { + u32 size; + + u32 sched_policy; + u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + u32 sched_priority; + + /* SCHED_DEADLINE */ + u64 sched_runtime; + u64 sched_deadline; + u64 sched_period; +}; + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -1958,6 +2018,8 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_setattr(struct task_struct *, + const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 94273bbe6050..40ed9e9a77e5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -38,6 +38,7 @@ struct rlimit; struct rlimit64; struct rusage; struct sched_param; +struct sched_attr; struct sel_arg_struct; struct semaphore; struct sembuf; @@ -279,9 +280,14 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param); asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param); +asmlinkage long sys_sched_setattr(pid_t pid, + struct sched_attr __user *attr); asmlinkage long sys_sched_getscheduler(pid_t pid); asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param); +asmlinkage long sys_sched_getattr(pid_t pid, + struct sched_attr __user *attr, + unsigned int size); asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr); asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, -- cgit v1.2.3 From aab03e05e8f7e26f51dee792beddcb5cca9215a5 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 28 Nov 2013 11:14:43 +0100 Subject: sched/deadline: Add SCHED_DEADLINE structures & implementation Introduces the data structures, constants and symbols needed for SCHED_DEADLINE implementation. Core data structure of SCHED_DEADLINE are defined, along with their initializers. Hooks for checking if a task belong to the new policy are also added where they are needed. Adds a scheduling class, in sched/dl.c and a new policy called SCHED_DEADLINE. It is an implementation of the Earliest Deadline First (EDF) scheduling algorithm, augmented with a mechanism (called Constant Bandwidth Server, CBS) that makes it possible to isolate the behaviour of tasks between each other. The typical -deadline task will be made up of a computation phase (instance) which is activated on a periodic or sporadic fashion. The expected (maximum) duration of such computation is called the task's runtime; the time interval by which each instance need to be completed is called the task's relative deadline. The task's absolute deadline is dynamically calculated as the time instant a task (better, an instance) activates plus the relative deadline. The EDF algorithms selects the task with the smallest absolute deadline as the one to be executed first, while the CBS ensures each task to run for at most its runtime every (relative) deadline length time interval, avoiding any interference between different tasks (bandwidth isolation). Thanks to this feature, also tasks that do not strictly comply with the computational model sketched above can effectively use the new policy. To summarize, this patch: - introduces the data structures, constants and symbols needed; - implements the core logic of the scheduling algorithm in the new scheduling class file; - provides all the glue code between the new scheduling class and the core scheduler and refines the interactions between sched/dl and the other existing scheduling classes. Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Signed-off-by: Fabio Checconi Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 +++++++++++++++++++++++++++++++++++++++++- include/linux/sched/deadline.h | 24 ++++++++++++++++++++++ include/uapi/linux/sched.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/deadline.h (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 86025b6c6387..6c196794fc12 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -97,6 +97,10 @@ struct sched_param { * Given this task model, there are a multiplicity of scheduling algorithms * and policies, that can be used to ensure all the tasks will make their * timing constraints. + * + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the + * only user of this new interface. More information about the algorithm + * available in the scheduling class file or in Documentation/. */ struct sched_attr { u32 size; @@ -1088,6 +1092,45 @@ struct sched_rt_entity { #endif }; +struct sched_dl_entity { + struct rb_node rb_node; + + /* + * Original scheduling parameters. Copied here from sched_attr + * during sched_setscheduler2(), they will remain the same until + * the next sched_setscheduler2(). + */ + u64 dl_runtime; /* maximum runtime for each instance */ + u64 dl_deadline; /* relative deadline of each instance */ + + /* + * Actual scheduling parameters. Initialized with the values above, + * they are continously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. + */ + s64 runtime; /* remaining runtime for this instance */ + u64 deadline; /* absolute deadline for this instance */ + unsigned int flags; /* specifying the scheduler behaviour */ + + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_new tells if a new instance arrived. If so we must + * start executing it with full runtime and reset its absolute + * deadline; + */ + int dl_throttled, dl_new; + + /* + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. + */ + struct hrtimer dl_timer; +}; struct rcu_node; @@ -1124,6 +1167,7 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif + struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ @@ -2099,7 +2143,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h new file mode 100644 index 000000000000..9d303b8847df --- /dev/null +++ b/include/linux/sched/deadline.h @@ -0,0 +1,24 @@ +#ifndef _SCHED_DEADLINE_H +#define _SCHED_DEADLINE_H + +/* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and + * NORMAL/BATCH tasks. + */ + +#define MAX_DL_PRIO 0 + +static inline int dl_prio(int prio) +{ + if (unlikely(prio < MAX_DL_PRIO)) + return 1; + return 0; +} + +static inline int dl_task(struct task_struct *p) +{ + return dl_prio(p->prio); +} + +#endif /* _SCHED_DEADLINE_H */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5a0f945927ac..2d5e49a2a6d2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,6 +39,7 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_DEADLINE 6 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 -- cgit v1.2.3 From 1baca4ce16b8cc7d4f50be1f7914799af30a2861 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 7 Nov 2013 14:43:38 +0100 Subject: sched/deadline: Add SCHED_DEADLINE SMP-related data structures & logic Introduces data structures relevant for implementing dynamic migration of -deadline tasks and the logic for checking if runqueues are overloaded with -deadline tasks and for choosing where a task should migrate, when it is the case. Adds also dynamic migrations to SCHED_DEADLINE, so that tasks can be moved among CPUs when necessary. It is also possible to bind a task to a (set of) CPU(s), thus restricting its capability of migrating, or forbidding migrations at all. The very same approach used in sched_rt is utilised: - -deadline tasks are kept into CPU-specific runqueues, - -deadline tasks are migrated among runqueues to achieve the following: * on an M-CPU system the M earliest deadline ready tasks are always running; * affinity/cpusets settings of all the -deadline tasks is always respected. Therefore, this very special form of "load balancing" is done with an active method, i.e., the scheduler pushes or pulls tasks between runqueues when they are woken up and/or (de)scheduled. IOW, every time a preemption occurs, the descheduled task might be sent to some other CPU (depending on its deadline) to continue executing (push). On the other hand, every time a CPU becomes idle, it might pull the second earliest deadline ready task from some other CPU. To enforce this, a pull operation is always attempted before taking any scheduling decision (pre_schedule()), as well as a push one after each scheduling decision (post_schedule()). In addition, when a task arrives or wakes up, the best CPU where to resume it is selected taking into account its affinity mask, the system topology, but also its deadline. E.g., from the scheduling point of view, the best CPU where to wake up (and also where to push) a task is the one which is running the task with the latest deadline among the M executing ones. In order to facilitate these decisions, per-runqueue "caching" of the deadlines of the currently running and of the first ready task is used. Queued but not running tasks are also parked in another rb-tree to speed-up pushes. Signed-off-by: Juri Lelli Signed-off-by: Dario Faggioli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-5-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6c196794fc12..cc66f2615a6d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1201,6 +1201,7 @@ struct task_struct { struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm, *active_mm; -- cgit v1.2.3 From 755378a47192a3d1f7c3a8ca6c15c1cf76de0af2 Mon Sep 17 00:00:00 2001 From: Harald Gustafsson Date: Thu, 7 Nov 2013 14:43:40 +0100 Subject: sched/deadline: Add period support for SCHED_DEADLINE tasks Make it possible to specify a period (different or equal than deadline) for -deadline tasks. Relative deadlines (D_i) are used on task arrivals to generate new scheduling (absolute) deadlines as "d = t + D_i", and periods (P_i) to postpone the scheduling deadlines as "d = d + P_i" when the budget is zero. This is in general useful to model (and schedule) tasks that have slow activation rates (long periods), but have to be scheduled soon once activated (short deadlines). Signed-off-by: Harald Gustafsson Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-7-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index cc66f2615a6d..158f4c2dd852 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1102,6 +1102,7 @@ struct sched_dl_entity { */ u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ + u64 dl_period; /* separation of two instances (period) */ /* * Actual scheduling parameters. Initialized with the values above, -- cgit v1.2.3 From fb00aca474405f4fa8a8519c3179fed722eabd83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2013 14:43:43 +0100 Subject: rtmutex: Turn the plist into an rb-tree Turn the pi-chains from plist to rb-tree, in the rt_mutex code, and provide a proper comparison function for -deadline and -priority tasks. This is done mainly because: - classical prio field of the plist is just an int, which might not be enough for representing a deadline; - manipulating such a list would become O(nr_deadline_tasks), which might be to much, as the number of -deadline task increases. Therefore, an rb-tree is used, and tasks are queued in it according to the following logic: - among two -priority (i.e., SCHED_BATCH/OTHER/RR/FIFO) tasks, the one with the higher (lower, actually!) prio wins; - among a -priority and a -deadline task, the latter always wins; - among two -deadline tasks, the one with the earliest deadline wins. Queueing and dequeueing functions are changed accordingly, for both the list of a task's pi-waiters and the list of tasks blocked on a pi-lock. Signed-off-by: Peter Zijlstra Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-again-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-10-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 ++++++++++ include/linux/rtmutex.h | 18 ++++++------------ include/linux/sched.h | 4 +++- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b0ed422e4e4a..f0e52383a001 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -154,6 +155,14 @@ extern struct task_group root_task_group; #define INIT_TASK_COMM "swapper" +#ifdef CONFIG_RT_MUTEXES +# define INIT_RT_MUTEXES(tsk) \ + .pi_waiters = RB_ROOT, \ + .pi_waiters_leftmost = NULL, +#else +# define INIT_RT_MUTEXES(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -221,6 +230,7 @@ extern struct task_group root_task_group; INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ INIT_CPUSET_SEQ(tsk) \ + INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ } diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index de17134244f3..3aed8d737e1a 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -13,7 +13,7 @@ #define __LINUX_RT_MUTEX_H #include -#include +#include #include extern int max_lock_depth; /* for sysctl */ @@ -22,12 +22,14 @@ extern int max_lock_depth; /* for sysctl */ * The rt_mutex structure * * @wait_lock: spinlock to protect the structure - * @wait_list: pilist head to enqueue waiters in priority order + * @waiters: rbtree root to enqueue waiters in priority order + * @waiters_leftmost: top waiter * @owner: the mutex owner */ struct rt_mutex { raw_spinlock_t wait_lock; - struct plist_head wait_list; + struct rb_root waiters; + struct rb_node *waiters_leftmost; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; @@ -66,7 +68,7 @@ struct hrtimer_sleeper; #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ + , .waiters = RB_ROOT \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} @@ -98,12 +100,4 @@ extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); -#ifdef CONFIG_RT_MUTEXES -# define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ - INIT_RT_MUTEX_DEBUG(tsk) -#else -# define INIT_RT_MUTEXES(tsk) -#endif - #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 158f4c2dd852..9ea15019a5b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -16,6 +16,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1354,7 +1355,8 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ - struct plist_head pi_waiters; + struct rb_root pi_waiters; + struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; #endif -- cgit v1.2.3 From 2d3d891d3344159d5b452a645e355bbe29591e8b Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:44 +0100 Subject: sched/deadline: Add SCHED_DEADLINE inheritance logic Some method to deal with rt-mutexes and make sched_dl interact with the current PI-coded is needed, raising all but trivial issues, that needs (according to us) to be solved with some restructuring of the pi-code (i.e., going toward a proxy execution-ish implementation). This is under development, in the meanwhile, as a temporary solution, what this commits does is: - ensure a pi-lock owner with waiters is never throttled down. Instead, when it runs out of runtime, it immediately gets replenished and it's deadline is postponed; - the scheduling parameters (relative deadline and default runtime) used for that replenishments --during the whole period it holds the pi-lock-- are the ones of the waiting task with earliest deadline. Acting this way, we provide some kind of boosting to the lock-owner, still by using the existing (actually, slightly modified by the previous commit) pi-architecture. We would stress the fact that this is only a surely needed, all but clean solution to the problem. In the end it's only a way to re-start discussion within the community. So, as always, comments, ideas, rants, etc.. are welcome! :-) Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli [ Added !RT_MUTEXES build fix. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-11-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++++- include/linux/sched/rt.h | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ea15019a5b6..13c53a99920f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1124,8 +1124,12 @@ struct sched_dl_entity { * @dl_new tells if a new instance arrived. If so we must * start executing it with full runtime and reset its absolute * deadline; + * + * @dl_boosted tells if we are boosted due to DI. If so we are + * outside bandwidth enforcement mechanism (but only until we + * exit the critical section). */ - int dl_throttled, dl_new; + int dl_throttled, dl_new, dl_boosted; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -1359,6 +1363,8 @@ struct task_struct { struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; + /* Top pi_waiters task */ + struct task_struct *pi_top_task; #endif #ifdef CONFIG_DEBUG_MUTEXES diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 440434df3627..34e4ebea8fce 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -35,6 +35,7 @@ static inline int rt_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { @@ -45,6 +46,10 @@ static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + return NULL; +} # define rt_mutex_adjust_pi(p) do { } while (0) static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { -- cgit v1.2.3 From 332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 7 Nov 2013 14:43:45 +0100 Subject: sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks In order of deadline scheduling to be effective and useful, it is important that some method of having the allocation of the available CPU bandwidth to tasks and task groups under control. This is usually called "admission control" and if it is not performed at all, no guarantee can be given on the actual scheduling of the -deadline tasks. Since when RT-throttling has been introduced each task group have a bandwidth associated to itself, calculated as a certain amount of runtime over a period. Moreover, to make it possible to manipulate such bandwidth, readable/writable controls have been added to both procfs (for system wide settings) and cgroupfs (for per-group settings). Therefore, the same interface is being used for controlling the bandwidth distrubution to -deadline tasks and task groups, i.e., new controls but with similar names, equivalent meaning and with the same usage paradigm are added. However, more discussion is needed in order to figure out how we want to manage SCHED_DEADLINE bandwidth at the task group level. Therefore, this patch adds a less sophisticated, but actually very sensible, mechanism to ensure that a certain utilization cap is not overcome per each root_domain (the single rq for !SMP configurations). Another main difference between deadline bandwidth management and RT-throttling is that -deadline tasks have bandwidth on their own (while -rt ones doesn't!), and thus we don't need an higher level throttling mechanism to enforce the desired bandwidth. This patch, therefore: - adds system wide deadline bandwidth management by means of: * /proc/sys/kernel/sched_dl_runtime_us, * /proc/sys/kernel/sched_dl_period_us, that determine (i.e., runtime / period) the total bandwidth available on each CPU of each root_domain for -deadline tasks; - couples the RT and deadline bandwidth management, i.e., enforces that the sum of how much bandwidth is being devoted to -rt -deadline tasks to stay below 100%. This means that, for a root_domain comprising M CPUs, -deadline tasks can be created until the sum of their bandwidths stay below: M * (sched_dl_runtime_us / sched_dl_period_us) It is also possible to disable this bandwidth management logic, and be thus free of oversubscribing the system up to any arbitrary level. Signed-off-by: Dario Faggioli Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1383831828-15501-12-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/sched/sysctl.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 13c53a99920f..a196cb7fc6f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1104,6 +1104,7 @@ struct sched_dl_entity { u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ u64 dl_period; /* separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 31e0193cb0c5..8070a83dbedc 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -81,6 +81,15 @@ static inline unsigned int get_sysctl_timer_migration(void) extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +/* + * control SCHED_DEADLINE reservations: + * + * /proc/sys/kernel/sched_dl_period_us + * /proc/sys/kernel/sched_dl_runtime_us + */ +extern unsigned int sysctl_sched_dl_period; +extern int sysctl_sched_dl_runtime; + #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -99,4 +108,8 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int sched_dl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #endif /* _SCHED_SYSCTL_H */ -- cgit v1.2.3 From 1724813d9f2c7ff702b46d3e4a4f6d9b10a8f8c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Dec 2013 12:44:49 +0100 Subject: sched/deadline: Remove the sysctl_sched_dl knobs Remove the deadline specific sysctls for now. The problem with them is that the interaction with the exisiting rt knobs is nearly impossible to get right. The current (as per before this patch) situation is that the rt and dl bandwidth is completely separate and we enforce rt+dl < 100%. This is undesirable because this means that the rt default of 95% leaves us hardly any room, even though dl tasks are saver than rt tasks. Another proposed solution was (a discarted patch) to have the dl bandwidth be a fraction of the rt bandwidth. This is highly confusing imo. Furthermore neither proposal is consistent with the situation we actually want; which is rt tasks ran from a dl server. In which case the rt bandwidth is a direct subset of dl. So whichever way we go, the introduction of dl controls at this point is painful. Therefore remove them and instead share the rt budget. This means that for now the rt knobs are used for dl admission control and the dl runtime is accounted against the rt runtime. I realise that this isn't entirely desirable either; but whatever we do we appear to need to change the interface later, so better have a small interface for now. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zpyqbqds1r0vyxtxza1e7rdc@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 8070a83dbedc..31e0193cb0c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -81,15 +81,6 @@ static inline unsigned int get_sysctl_timer_migration(void) extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; -/* - * control SCHED_DEADLINE reservations: - * - * /proc/sys/kernel/sched_dl_period_us - * /proc/sys/kernel/sched_dl_runtime_us - */ -extern unsigned int sysctl_sched_dl_period; -extern int sysctl_sched_dl_runtime; - #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif @@ -108,8 +99,4 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -int sched_dl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - #endif /* _SCHED_SYSCTL_H */ -- cgit v1.2.3 From 9ea4c380066fbe23fe0da7f4abfabc444f2467f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: locking: Optimize lock_bh functions Currently all _bh_ lock functions do two preempt_count operations: local_bh_disable(); preempt_disable(); and for the unlock: preempt_enable_no_resched(); local_bh_enable(); Since its a waste of perfectly good cycles to modify the same variable twice when you can do it in one go; use the new __local_bh_{dis,en}able_ip() functions that allow us to provide a preempt_count value to add/sub. So define SOFTIRQ_LOCK_OFFSET as the offset a _bh_ lock needs to add/sub to be done in one go. As a bonus it gets rid of the preempt_enable_no_resched() usage. This reduces a 1000 loops of: spin_lock_bh(&bh_lock); spin_unlock_bh(&bh_lock); from 53596 cycles to 51995 cycles. I didn't do enough measurements to say for absolute sure that the result is significant but the the few runs I did for each suggest it is so. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: rjw@rjwysocki.net Cc: rui.zhang@intel.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/preempt_mask.h | 15 +++++++++++++++ include/linux/rwlock_api_smp.h | 12 ++++-------- include/linux/spinlock_api_smp.h | 12 ++++-------- include/linux/spinlock_api_up.h | 16 +++++++++++----- 4 files changed, 34 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h index d169820203dd..b8d96bca4357 100644 --- a/include/linux/preempt_mask.h +++ b/include/linux/preempt_mask.h @@ -78,6 +78,21 @@ # define PREEMPT_CHECK_OFFSET 0 #endif +/* + * The preempt_count offset needed for things like: + * + * spin_lock_bh() + * + * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and + * softirqs, such that unlock sequences of: + * + * spin_unlock(); + * local_bh_enable(); + * + * Work as expected. + */ +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) + /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 9c9f0495d37c..5b9b84b20407 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -172,8 +172,7 @@ static inline void __raw_read_lock_irq(rwlock_t *lock) static inline void __raw_read_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); } @@ -200,8 +199,7 @@ static inline void __raw_write_lock_irq(rwlock_t *lock) static inline void __raw_write_lock_bh(rwlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } @@ -250,8 +248,7 @@ static inline void __raw_read_unlock_bh(rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); do_raw_read_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, @@ -275,8 +272,7 @@ static inline void __raw_write_unlock_bh(rwlock_t *lock) { rwlock_release(&lock->dep_map, 1, _RET_IP_); do_raw_write_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } #endif /* __LINUX_RWLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index bdb9993f0fda..42dfab89e740 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -131,8 +131,7 @@ static inline void __raw_spin_lock_irq(raw_spinlock_t *lock) static inline void __raw_spin_lock_bh(raw_spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); } @@ -174,20 +173,17 @@ static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock) { spin_release(&lock->dep_map, 1, _RET_IP_); do_raw_spin_unlock(lock); - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) { - local_bh_disable(); - preempt_disable(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); if (do_raw_spin_trylock(lock)) { spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); return 1; } - preempt_enable_no_resched(); - local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); return 0; } diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index af1f47229e70..d0d188861ad6 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -24,11 +24,14 @@ * flags straight, to suppress compiler warnings of unused lock * variables, and to add the proper checker annotations: */ +#define ___LOCK(lock) \ + do { __acquire(lock); (void)(lock); } while (0) + #define __LOCK(lock) \ - do { preempt_disable(); __acquire(lock); (void)(lock); } while (0) + do { preempt_disable(); ___LOCK(lock); } while (0) #define __LOCK_BH(lock) \ - do { local_bh_disable(); __LOCK(lock); } while (0) + do { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); ___LOCK(lock); } while (0) #define __LOCK_IRQ(lock) \ do { local_irq_disable(); __LOCK(lock); } while (0) @@ -36,12 +39,15 @@ #define __LOCK_IRQSAVE(lock, flags) \ do { local_irq_save(flags); __LOCK(lock); } while (0) +#define ___UNLOCK(lock) \ + do { __release(lock); (void)(lock); } while (0) + #define __UNLOCK(lock) \ - do { preempt_enable(); __release(lock); (void)(lock); } while (0) + do { preempt_enable(); ___UNLOCK(lock); } while (0) #define __UNLOCK_BH(lock) \ - do { preempt_enable_no_resched(); local_bh_enable(); \ - __release(lock); (void)(lock); } while (0) + do { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_LOCK_OFFSET); \ + ___UNLOCK(lock); } while (0) #define __UNLOCK_IRQ(lock) \ do { local_irq_enable(); __UNLOCK(lock); } while (0) -- cgit v1.2.3 From 62b94a08da1bae9d187d49dfcd6665af393750f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 16:52:19 +0100 Subject: sched/preempt: Take away preempt_enable_no_resched() from modules Discourage drivers/modules to be creative with preemption. Sadly all is implemented in macros and inline so if they want to do evil they still can, but at least try and discourage some. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Eliezer Tamir Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Rusty Russell Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: rjw@rjwysocki.net Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-fn7h6vu8wtgxk0ih402qcijx@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 22 ++++++++++++++++++++-- include/linux/uaccess.h | 5 ++++- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index a3d9dc8c2c00..dd9ddf8af205 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -64,7 +64,11 @@ do { \ } while (0) #else -#define preempt_enable() preempt_enable_no_resched() +#define preempt_enable() \ +do { \ + barrier(); \ + preempt_count_dec(); \ +} while (0) #define preempt_check_resched() do { } while (0) #endif @@ -93,7 +97,11 @@ do { \ __preempt_schedule_context(); \ } while (0) #else -#define preempt_enable_notrace() preempt_enable_no_resched_notrace() +#define preempt_enable_notrace() \ +do { \ + barrier(); \ + __preempt_count_dec(); \ +} while (0) #endif #else /* !CONFIG_PREEMPT_COUNT */ @@ -116,6 +124,16 @@ do { \ #endif /* CONFIG_PREEMPT_COUNT */ +#ifdef MODULE +/* + * Modules have no business playing preemption tricks. + */ +#undef sched_preempt_enable_no_resched +#undef preempt_enable_no_resched +#undef preempt_enable_no_resched_notrace +#undef preempt_check_resched +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9d8cf056e661..ecd3319dac33 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -25,13 +25,16 @@ static inline void pagefault_disable(void) static inline void pagefault_enable(void) { +#ifndef CONFIG_PREEMPT /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); preempt_count_dec(); - preempt_check_resched(); +#else + preempt_enable(); +#endif } #ifndef ARCH_HAS_NOCACHE_UACCESS -- cgit v1.2.3 From 35af99e646c7f7ea46dc2977601e9e71a51dadd5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 19:38:42 +0100 Subject: sched/clock, x86: Use a static_key for sched_clock_stable In order to avoid the runtime condition and variable load turn sched_clock_stable into a static_key. Also provide a shorter implementation of local_clock() and cpu_clock(int) when sched_clock_stable==1. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 221876 215295 (cold) local_clock: 301773 234692 220773 (warm) sched_clock: 38375 25602 25659 (warm) local_clock: 100371 33265 27242 (warm) rdtsc: 27340 24214 24208 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 235941 237019 (cold) local_clock: 396890 297017 294819 (warm) sched_clock: 38194 25233 25609 (warm) local_clock: 143452 71234 71232 (warm) rdtsc: 27345 24245 24243 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-eummbdechzz37mwmpags1gjr@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a196cb7fc6f2..a03875221663 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1994,7 +1994,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) * but then during bootup it turns out that sched_clock() * is reliable after all: */ -extern int sched_clock_stable; +extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); +extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); -- cgit v1.2.3 From 0bd3a173d711857fc9f583eb5825386cc08f3948 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: sched/preempt, locking: Rework local_bh_{dis,en}able() Currently local_bh_disable() is out-of-line for no apparent reason. So inline it to save a few cycles on call/return nonsense, the function body is a single add on x86 (a few loads and store extra on load/store archs). Also expose two new local_bh functions: __local_bh_{dis,en}able_ip(unsigned long ip, unsigned int cnt); Which implement the actual local_bh_{dis,en}able() behaviour. The next patch uses the exposed @cnt argument to optimize bh lock functions. With build fixes from Jacob Pan. Cc: rjw@rjwysocki.net Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 32 +++++++++++++++++++++++++++++--- include/linux/hardirq.h | 1 + include/linux/preempt_mask.h | 1 - 3 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 27b1bcffe408..86c12c93e3cf 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -1,9 +1,35 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H -extern void local_bh_disable(void); +#include +#include + +#ifdef CONFIG_TRACE_IRQFLAGS +extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); +#else +static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +{ + preempt_count_add(cnt); + barrier(); +} +#endif + +static inline void local_bh_disable(void) +{ + __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); +} + extern void _local_bh_enable(void); -extern void local_bh_enable(void); -extern void local_bh_enable_ip(unsigned long ip); +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); + +static inline void local_bh_enable_ip(unsigned long ip) +{ + __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); +} + +static inline void local_bh_enable(void) +{ + __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); +} #endif /* _LINUX_BH_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d9cf963ac832..12d5f972f23f 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -5,6 +5,7 @@ #include #include #include +#include extern void synchronize_irq(unsigned int irq); diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h index b8d96bca4357..dbeec4d4a3be 100644 --- a/include/linux/preempt_mask.h +++ b/include/linux/preempt_mask.h @@ -2,7 +2,6 @@ #define LINUX_PREEMPT_MASK_H #include -#include /* * We put the hardirq and softirq counter into the preemption -- cgit v1.2.3 From 8cb75e0c4ec9786b81439761eac1d18d4a931af3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 12:22:37 +0100 Subject: sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding With various drivers wanting to inject idle time; we get people calling idle routines outside of the idle loop proper. Therefore we need to be extra careful about not missing TIF_NEED_RESCHED -> PREEMPT_NEED_RESCHED propagations. While looking at this, I also realized there's a small window in the existing idle loop where we can miss TIF_NEED_RESCHED; when it hits right after the tif_need_resched() test at the end of the loop but right before the need_resched() test at the start of the loop. So move preempt_fold_need_resched() out of the loop where we're guaranteed to have TIF_NEED_RESCHED set. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-x9jgh45oeayzajz2mjt0y7d6@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 15 +++++++++++++++ include/linux/sched.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index dd9ddf8af205..59749fc48328 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,6 +134,21 @@ do { \ #undef preempt_check_resched #endif +#ifdef CONFIG_PREEMPT +#define preempt_set_need_resched() \ +do { \ + set_preempt_need_resched(); \ +} while (0) +#define preempt_fold_need_resched() \ +do { \ + if (tif_need_resched()) \ + set_preempt_need_resched(); \ +} while (0) +#else +#define preempt_set_need_resched() do { } while (0) +#define preempt_fold_need_resched() do { } while (0) +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/sched.h b/include/linux/sched.h index a03875221663..ffccdad050b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2745,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif +static inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb(); /* paired with resched_task() */ + + preempt_fold_need_resched(); +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); -- cgit v1.2.3 From 37089834528be3ef8cbf927e47c753b3e272a856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:13:38 +0100 Subject: sched, net: Fixup busy_loop_us_clock() The only valid use of preempt_enable_no_resched() is if the very next line is schedule() or if we know preemption cannot actually be enabled by that statement due to known more preempt_count 'refs'. This busy_poll stuff looks to be completely and utterly broken, sched_clock() can return utter garbage with interrupts enabled (rare but still) and it can drift unbounded between CPUs. This means that if you get preempted/migrated and your new CPU is years behind on the previous CPU we get to busy spin for a _very_ long time. There is a _REASON_ sched_clock() warns about preemptability - papering over it with a preempt_disable()/preempt_enable_no_resched() is just terminal brain damage on so many levels. Replace sched_clock() usage with local_clock() which has a bounded drift between CPUs (<2 jiffies). There is a further problem with the entire busy wait poll thing in that the spin time is additive to the syscall timeout, not inclusive. Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: David S. Miller Cc: rui.zhang@intel.com Cc: jacob.jun.pan@linux.intel.com Cc: Mike Galbraith Cc: hpa@zytor.com Cc: Arjan van de Ven Cc: lenb@kernel.org Cc: rjw@rjwysocki.net Cc: Eliezer Tamir Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131119151338.GF3694@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/net/busy_poll.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 829627d7b846..1d67fb6b23a0 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -42,27 +42,10 @@ static inline bool net_busy_loop_on(void) return sysctl_net_busy_poll; } -/* a wrapper to make debug_smp_processor_id() happy - * we can use sched_clock() because we don't care much about precision - * we only care that the average is bounded - */ -#ifdef CONFIG_DEBUG_PREEMPT -static inline u64 busy_loop_us_clock(void) -{ - u64 rc; - - preempt_disable_notrace(); - rc = sched_clock(); - preempt_enable_no_resched_notrace(); - - return rc >> 10; -} -#else /* CONFIG_DEBUG_PREEMPT */ static inline u64 busy_loop_us_clock(void) { - return sched_clock() >> 10; + return local_clock() >> 10; } -#endif /* CONFIG_DEBUG_PREEMPT */ static inline unsigned long sk_busy_loop_end_time(struct sock *sk) { -- cgit v1.2.3 From 7479f3c9cf67edf5e8a76b21ea3726757f35cf53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Jan 2014 17:05:04 +0100 Subject: sched: Move SCHED_RESET_ON_FORK into attr::sched_flags I noticed the new sched_{set,get}attr() calls didn't properly deal with the SCHED_RESET_ON_FORK hack. Instead of propagating the flags in high bits nonsense use the brand spanking new attr::sched_flags field. Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Dario Faggioli Link: http://lkml.kernel.org/r/20140115162242.GJ31570@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 2d5e49a2a6d2..34f9d7387d13 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -40,8 +40,13 @@ /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 #define SCHED_DEADLINE 6 + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 +/* + * For the sched_{set,get}attr() calls + */ +#define SCHED_FLAG_RESET_ON_FORK 0x01 #endif /* _UAPI_LINUX_SCHED_H */ -- cgit v1.2.3