diff options
Diffstat (limited to 'kernel')
167 files changed, 13787 insertions, 4991 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9c..790d83c7d160 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -5,3 +5,4 @@ config_data.h config_data.gz timeconst.h hz.bc +x509_certificate_list diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 94fabd534b03..2a202a846757 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) + def_bool HIGH_RES_TIMERS diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb02..bc010ee272b6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,56 +6,44 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + extable.o params.o posix-timers.o \ + kthread.o sys_ni.o posix-cpu-timers.o \ + hrtimer.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o lglock.o smpboot.o + async.o range.o groups.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += locking/ obj-y += power/ obj-y += printk/ obj-y += cpu/ obj-y += irq/ +obj-y += rcu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ -obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o -obj-$(CONFIG_LOCKDEP) += lockdep.o -ifeq ($(CONFIG_PROC_FS),y) -obj-$(CONFIG_LOCKDEP) += lockdep_proc.o -endif obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -81,12 +69,6 @@ obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_SECCOMP) += seccomp.o -obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_TREE_RCU) += rcutree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o -obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o -obj-$(CONFIG_TINY_RCU) += rcutiny.o -obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -141,19 +123,53 @@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) -ifeq ($(CONFIG_MODULE_SIG),y) +############################################################################### +# +# Roll all the X.509 certificates that we can find together and pull them into +# the kernel so that they get loaded into the system trusted keyring during +# boot. # -# Pull the signing certificate and any extra certificates into the kernel +# We look in the source root and the build root for all files whose name ends +# in ".x509". Unfortunately, this will generate duplicate filenames, so we +# have make canonicalise the pathnames and then sort them to discard the +# duplicates. # +############################################################################### +ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) +X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 +X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ + $(or $(realpath $(CERT)),$(CERT)))) +X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) + +ifeq ($(X509_CERTIFICATES),) +$(warning *** No X.509 certificates found ***) +endif -quiet_cmd_touch = TOUCH $@ - cmd_touch = touch $@ +ifneq ($(wildcard $(obj)/.x509.list),) +ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) +$(info X.509 certificate list changed) +$(shell rm $(obj)/.x509.list) +endif +endif + +kernel/system_certificates.o: $(obj)/x509_certificate_list + +quiet_cmd_x509certs = CERTS $@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") -extra_certificates: - $(call cmd,touch) +targets += $(obj)/x509_certificate_list +$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list + $(call if_changed,x509certs) -kernel/modsign_certificate.o: signing_key.x509 extra_certificates +targets += $(obj)/.x509.list +$(obj)/.x509.list: + @echo $(X509_CERTIFICATES) >$@ +endif + +clean-files := x509_certificate_list .x509.list +ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307dc9453..67ccf0e7cca9 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, - .free_group_priv = NULL, - .free_event_priv = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4d369c..2596fac5dcb4 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; static int __init audit_watch_init(void) diff --git a/kernel/bounds.c b/kernel/bounds.c index 0c9b862292b2..9fd4246b04b8 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -10,6 +10,8 @@ #include <linux/mmzone.h> #include <linux/kbuild.h> #include <linux/page_cgroup.h> +#include <linux/log2.h> +#include <linux/spinlock_types.h> void foo(void) { @@ -17,5 +19,9 @@ void foo(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); +#ifdef CONFIG_SMP + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); +#endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8bd9cfdc70d7..e2f46ba37f72 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -41,7 +41,6 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/backing-dev.h> -#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/spinlock.h> @@ -56,15 +55,20 @@ #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ -#include <linux/eventfd.h> -#include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> -#include <linux/file.h> #include <linux/atomic.h> /* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -89,6 +93,33 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +#define cgroup_assert_mutex_or_rcu_locked() \ + rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + +#ifdef CONFIG_LOCKDEP +#define cgroup_assert_mutex_or_root_locked() \ + WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ + !lockdep_is_held(&cgroup_root_mutex))) +#else +#define cgroup_assert_mutex_or_root_locked() do { } while (0) +#endif + +/* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + /* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are @@ -111,81 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root; /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. - */ -#define CSS_ID_MAX (65535) -struct css_id { - /* - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. - * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_tryget() - * should be used for avoiding race. - */ - struct cgroup_subsys_state __rcu *css; - /* - * ID of this css. - */ - unsigned short id; - /* - * Depth in hierarchy which this ID belongs to. - */ - unsigned short depth; - /* - * ID is freed by RCU. (and lookup routine is RCU safe.) - */ - struct rcu_head rcu_head; - /* - * Hierarchy of CSS ID belongs to. - */ - unsigned short stack[0]; /* Array of Length (depth+1) */ -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -223,6 +179,8 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static int cgroup_file_release(struct inode *inode, struct file *file); +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -285,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp) } /** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** * for_each_subsys - iterate all loaded cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * - * Should be called under cgroup_mutex. + * Iterates through all loaded subsystems. Should be called under + * cgroup_mutex or cgroup_root_mutex. */ -#define for_each_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - !((ss) = cgroup_subsys[i]); })) { } \ +#define for_each_subsys(ss, ssid) \ + for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ + (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((ss) = cgroup_subsys[(ssid)])) { } \ else /** @@ -309,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[i]) || true); (i)++) -/* iterate each subsystem attached to a hierarchy */ -#define for_each_root_subsys(root, ss) \ - list_for_each_entry((ss), &(root)->subsys_list, sibling) - /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) @@ -387,9 +357,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@ -841,8 +808,6 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); - static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -891,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work) */ deactivate_super(cgrp->root->sb); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); + cgroup_pidlist_destroy_all(cgrp); simple_xattrs_free(&cgrp->xattrs); @@ -908,7 +869,7 @@ static void cgroup_free_rcu(struct rcu_head *head) struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -918,6 +879,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) struct cgroup *cgrp = dentry->d_fsdata; BUG_ON(!(cgroup_is_dead(cgrp))); + + /* + * XXX: cgrp->id is only used to look up css's. As cgroup + * and css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); @@ -932,11 +903,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1073,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgroup_css(cgroup_dummy_top, ss)); cgroup_css(cgrp, ss)->cgroup = cgrp; - list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgroup_css(cgrp, ss)); @@ -1092,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); @@ -1119,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; + int ssid; mutex_lock(&cgroup_root_mutex); - for_each_root_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) @@ -1385,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } @@ -1394,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -1523,7 +1486,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = @@ -1716,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(ret); } -static void cgroup_kill_sb(struct super_block *sb) { +static void cgroup_kill_sb(struct super_block *sb) +{ struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; @@ -1999,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, bool threadgroup) { int retval, i, group_size; - struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroupfs_root *root = cgrp->root; + struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ struct task_struct *leader = tsk; struct task_and_cgroup *tc; @@ -2073,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 1: check that we can legitimately attach to the cgroup. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->can_attach) { - retval = ss->can_attach(css, &tset); + for_each_css(css, i, cgrp) { + if (css->ss->can_attach) { + retval = css->ss->can_attach(css, &tset); if (retval) { - failed_ss = ss; + failed_css = css; goto out_cancel_attach; } } @@ -2115,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 4: do subsystem attach callbacks. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->attach) - ss->attach(css, &tset); - } + for_each_css(css, i, cgrp) + if (css->ss->attach) + css->ss->attach(css, &tset); /* * step 5: success! and cleanup @@ -2137,13 +2096,11 @@ out_put_css_set_refs: } out_cancel_attach: if (retval) { - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss == failed_ss) + for_each_css(css, i, cgrp) { + if (css == failed_css) break; - if (ss->cancel_attach) - ss->cancel_attach(css, &tset); + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -2171,7 +2128,7 @@ retry_find_task: tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - ret= -ESRCH; + ret = -ESRCH; goto out_unlock_cgroup; } /* @@ -2283,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, return 0; } -static int cgroup_release_agent_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_release_agent_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = seq_css(seq)->cgroup; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2296,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css, return 0; } -static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) { - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cfe->css; + size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + char *buf; + int ret; - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) + if (nbytes >= max_bytes) return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - buffer[nbytes] = 0; /* nul-terminate */ - if (cft->write_u64) { - u64 val = simple_strtoull(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(css, cft, val); + buf = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, userbuf, nbytes)) { + ret = -EFAULT; + goto out_free; + } + + buf[nbytes] = '\0'; + + if (cft->write_string) { + ret = cft->write_string(css, cft, strstrip(buf)); + } else if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else if (cft->trigger) { + ret = cft->trigger(css, (unsigned int)cft->private); } else { - s64 val = simple_strtoll(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(css, cft, val); + ret = -EINVAL; } - if (!retval) - retval = nbytes; - return retval; +out_free: + kfree(buf); + return ret ?: nbytes; } -static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. + */ + +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; + struct cftype *cft = seq_cft(seq); - if (!max_bytes) - max_bytes = sizeof(local_buffer) - 1; - if (nbytes >= max_bytes) - return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; + if (cft->seq_start) { + return cft->seq_start(seq, ppos); + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; } - - buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(css, cft, strstrip(buffer)); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; } -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + struct cftype *cft = seq_cft(seq); - if (cft->write) - return cft->write(css, cft, file, buf, nbytes, ppos); - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(css, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(css, (unsigned int)cft->private); - return ret ? ret : nbytes; + if (cft->seq_next) { + return cft->seq_next(seq, v, ppos); + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; } - return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(css, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); + struct cftype *cft = seq_cft(seq); - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + if (cft->seq_stop) + cft->seq_stop(seq, v); } -static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(css, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + if (cft->seq_show) + return cft->seq_show(m, arg); - if (cft->read) - return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); - return -EINVAL; -} - -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) -{ - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); -} - -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cfent *cfe = m->private; - struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(css, cft, &cb); - } - return cft->read_seq_string(css, cft, m); + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; } -static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = single_release, +static struct seq_operations cgroup_seq_operations = { + .start = cgroup_seqfile_start, + .next = cgroup_seqfile_next, + .stop = cgroup_seqfile_stop, + .show = cgroup_seqfile_show, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2472,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); struct cgroup_subsys_state *css; + struct cgroup_open_file *of; int err; err = generic_file_open(inode, file); @@ -2501,30 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->read_map || cft->read_seq_string) { - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, cfe); - } else if (cft->open) { - err = cft->open(inode, file); + of = __seq_open_private(file, &cgroup_seq_operations, + sizeof(struct cgroup_open_file)); + if (of) { + of->cfe = cfe; + return 0; } - if (css->ss && err) + if (css->ss) css_put(css); - return err; + return -ENOMEM; } static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - int ret = 0; - if (cft->release) - ret = cft->release(inode, file); if (css->ss) css_put(css); - return ret; + return seq_release_private(inode, file); } /* @@ -2635,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) } static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, + .read = seq_read, .write = cgroup_file_write, .llseek = generic_file_llseek, .open = cgroup_file_open, @@ -2660,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -/* - * Check if a file is a control file - */ -static inline struct cftype *__file_cft(struct file *file) -{ - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { @@ -2727,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->mode) return cft->mode; - if (cft->read || cft->read_u64 || cft->read_s64 || - cft->read_map || cft->read_seq_string) + if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write || cft->write_u64 || cft->write_s64 || - cft->write_string || cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) mode |= S_IWUSR; return mode; @@ -3028,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void) * @parent_css: css whose children to walk * * This function returns the next child of @parent_css and should be called - * under RCU read lock. The only requirement is that @parent_css and - * @pos_css are accessible. The next sibling is guaranteed to be returned - * regardless of their states. + * under either cgroup_mutex or RCU read lock. The only requirement is + * that @parent_css and @pos_css are accessible. The next sibling is + * guaranteed to be returned regardless of their states. */ struct cgroup_subsys_state * css_next_child(struct cgroup_subsys_state *pos_css, @@ -3040,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. Once a cgroup is removed, @@ -3087,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child); * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @root are accessible and @pos is a descendant of @root. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @root are accessible and @pos is a descendant of @root. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3098,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3129,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct rightmost descendant as long as @pos is - * accessible. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct rightmost descendant as + * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3175,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @cgroup are accessible and @pos is a descendant of + * @cgroup. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -3186,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3525,14 +3423,12 @@ struct cgroup_pidlist { pid_t *list; /* how many elements the above list has */ int length; - /* how many files are using the current array */ - int use_count; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore rwsem; + /* for delayed destruction */ + struct delayed_work destroy_dwork; }; /* @@ -3548,6 +3444,7 @@ static void *pidlist_allocate(int count) else return kmalloc(count * sizeof(pid_t), GFP_KERNEL); } + static void pidlist_free(void *p) { if (is_vmalloc_addr(p)) @@ -3557,6 +3454,47 @@ static void pidlist_free(void *p) } /* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ @@ -3586,52 +3524,92 @@ after: return dest; } +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if + * sane_behavior so that no such expectation exists in the new interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; + + return (a << 1) | (b >> 1); +} + +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +{ + if (cgroup_sane_behavior(cgrp)) + return pid_fry(pid); + else + return pid; +} + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - /* - * We can't drop the pidlist_mutex before taking the l->rwsem in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). - */ - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry(l, &cgrp->pidlists, links) { - if (l->key.type == type && l->key.ns == ns) { - /* make sure l doesn't vanish out from under us */ - down_write(&l->rwsem); - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - } + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); + if (!l) return l; - } - init_rwsem(&l->rwsem); - down_write(&l->rwsem); + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; - l->key.ns = get_pid_ns(ns); + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); - mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3648,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct task_struct *tsk; struct cgroup_pidlist *l; + lockdep_assert_held(&cgrp->pidlist_mutex); + /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -3674,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); + if (cgroup_sane_behavior(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); - l = cgroup_pidlist_find(cgrp, type); + + l = cgroup_pidlist_find_create(cgrp, type); if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } - /* store array, freeing old if necessary - lock already held */ + + /* store array, freeing old if necessary */ pidlist_free(l->list); l->list = array; l->length = length; - l->use_count++; - up_write(&l->rwsem); *lp = l; return 0; } @@ -3761,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist *l = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; - int *iter; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; - down_read(&l->rwsem); if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) index = mid + 1; else end = mid; @@ -3785,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = *iter; + *pos = cgroup_pid_fry(cgrp, *iter); return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist *l = s->private; - up_read(&l->rwsem); + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist *l = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -3808,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = *p; + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); return p; } } @@ -3829,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { .show = cgroup_pidlist_show, }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l) -{ - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. - */ - mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->rwsem); - BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->rwsem); - kfree(l); - return; - } - mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->rwsem); -} - -static int cgroup_pidlist_release(struct inode *inode, struct file *file) -{ - struct cgroup_pidlist *l; - if (!(file->f_mode & FMODE_READ)) - return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); - return seq_release(inode, file); -} - -static const struct file_operations cgroup_pidlist_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, - .release = cgroup_pidlist_release, -}; - -/* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. - */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) -{ - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist *l; - int retval; - - /* Nothing to do for write-only files */ - if (!(file->f_mode & FMODE_READ)) - return 0; - - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; - - retval = seq_open(file, &cgroup_pidlist_seq_operations); - if (retval) { - cgroup_release_pid_array(l); - return retval; - } - ((struct seq_file *)file->private_data)->private = l; - return 0; -} -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3949,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp) deactivate_super(sb); } -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format '<event_fd> <control_fd> <args>'. - * Interpretation of args is defined by control file implementation. - */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) -{ - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. - */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - -out_put_css: - css_put(event->css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4164,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", - .open = cgroup_procs_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, .write_u64 = cgroup_procs_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, - { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, @@ -4183,7 +3914,7 @@ static struct cftype cgroup_base_files[] = { { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_sane_behavior_show, + .seq_show = cgroup_sane_behavior_show, }, /* @@ -4194,9 +3925,12 @@ static struct cftype cgroup_base_files[] = { { .name = "tasks", .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { @@ -4208,7 +3942,7 @@ static struct cftype cgroup_base_files[] = { { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_release_agent_show, + .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, }, @@ -4240,21 +3974,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) goto err; } } - - /* This cgroup is ready now */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (id) - rcu_assign_pointer(id->css, css); - } - return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@ -4306,7 +4025,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4314,6 +4033,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); + rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4323,7 +4043,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->cgroup = cgrp; css->ss = ss; css->flags = 0; - css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@ -4369,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } +/** + * create_css - create a cgroup_subsys_state + * @cgrp: the cgroup new css will be associated with + * @ss: the subsys of new css + * + * Create a new css associated with @cgrp - @ss pair. On success, the new + * css is online and installed in @cgrp with all interface files created. + * Returns 0 on success, -errno on failure. + */ +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *css; + int err; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + css = ss->css_alloc(cgroup_css(parent, ss)); + if (IS_ERR(css)) + return PTR_ERR(css); + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free; + + init_css(css, ss, cgrp); + + err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + if (err) + goto err_free; + + err = online_css(css); + if (err) + goto err_free; + + dget(cgrp->dentry); + css_get(css->parent); + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } + + return 0; + +err_free: + percpu_ref_cancel_init(&css->refcnt); + ss->css_free(css); + return err; +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4380,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { - struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int err = 0; + int ssid, err = 0; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4440,29 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css; - - css = ss->css_alloc(cgroup_css(parent, ss)); - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_free_all; - } - css_ar[ss->subsys_id] = css; - - err = percpu_ref_init(&css->refcnt, css_release); - if (err) - goto err_free_all; - - init_css(css, ss, cgrp); - - if (ss->use_id) { - err = alloc_css_id(css); - if (err) - goto err_free_all; - } - } - /* * Create directory. cgroup_create_file() returns with the new * directory locked on success so that it can be populated without @@ -4470,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_free_all; + goto err_unlock; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4479,59 +4230,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry and the parent css */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - dget(dentry); - css_get(css->parent); - } - /* hold a ref to the parent's dentry */ dget(parent->dentry); - /* creation succeeded, notify subsystems */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - err = online_css(css); - if (err) - goto err_destroy; - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } - } - + /* + * @cgrp is now fully operational. If something fails after this + * point, it'll be released via the normal destruction path. + */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; - err = cgroup_populate_dir(cgrp, root->subsys_mask); - if (err) - goto err_destroy; + /* let's create and online css's */ + for_each_subsys(ss, ssid) { + if (root->subsys_mask & (1 << ssid)) { + err = create_css(cgrp, ss); + if (err) + goto err_destroy; + } + } mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; -err_free_all: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } +err_unlock: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); @@ -4603,7 +4329,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@ -4666,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; - struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; struct cgroup *child; bool empty; + int ssid; lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4705,8 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * will be invoked to perform the rest of destruction once the * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) - kill_css(cgroup_css(cgrp, ss)); + for_each_css(css, ssid, cgrp) + kill_css(css); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4741,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; @@ -4775,14 +4489,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - /* - * We should remove the cgroup object from idr before its grace - * period starts, so we won't be looking up a cgroup while the - * cgroup is being freed. - */ - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); @@ -4831,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ @@ -4905,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); cgroup_subsys[ss->subsys_id] = ss; /* @@ -4916,21 +4622,15 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_css() because it sets css->id. */ - if (ss->use_id) { - ret = cgroup_init_idr(ss, css); - if (ret) - goto err_unload; - } /* * Now we need to entangle the css into the existing css_sets. unlike @@ -4956,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ret = online_css(css); - if (ret) + if (ret) { + ss->css_free(css); goto err_unload; + } /* success! */ + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; err_unload: + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); /* @ss can't be mounted here as try_module_get() would fail */ cgroup_unload_subsys(ss); @@ -4982,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cgrp_cset_link *link; + struct cgroup_subsys_state *css; BUG_ON(ss->module == NULL); @@ -4993,18 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss)); - - if (ss->use_id) - idr_destroy(&ss->idr); + css = cgroup_css(cgroup_dummy_top, ss); + if (css) + offline_css(css); /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; - /* remove subsystem from the dummy root's list of subsystems */ - list_del_init(&ss->sibling); - /* * disentangle the css from all css_sets attached to the dummy * top. as in loading, we need to pay our respects to the hashtable @@ -5025,12 +4727,13 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. note that this - * also takes care of freeing the css_id. + * the cgrp->subsys pointer to find their state. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss)); + if (css) + ss->css_free(css); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); } EXPORT_SYMBOL_GPL(cgroup_unload_subsys); @@ -5097,8 +4800,6 @@ int __init cgroup_init(void) for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@ -5139,6 +4840,31 @@ out: return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy @@ -5178,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int count = 0; + int ssid, count = 0; seq_printf(m, "%d:", root->hierarchy_id); - for_each_root_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); @@ -5518,196 +5245,21 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_raw(css->id); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. - */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - - child_id = rcu_dereference(child->id); - if (!child_id) - return false; - root_id = rcu_dereference(root->id); - if (!root_id) - return false; - if (child_id->depth < root_id->depth) - return false; - if (child_id->stack[root_id->depth] != root_id->id) - return false; - return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int ret, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - - idr_preload(GFP_KERNEL); - spin_lock(&ss->id_lock); - /* Don't use 0. allocates an ID of 1-65535 */ - ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); - spin_unlock(&ss->id_lock); - idr_preload_end(); - - /* Returns error when there are no free spaces for new ID.*/ - if (ret < 0) - goto err_out; - - newid->id = ret; - newid->depth = depth; - return newid; -err_out: - kfree(newid); - return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - spin_lock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - RCU_INIT_POINTER(newid->css, rootcss); - RCU_INIT_POINTER(rootcss->id, newid); - return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ - struct cgroup_subsys_state *parent_css = css_parent(child_css); - struct css_id *child_id, *parent_id; - int i, depth; - - parent_id = rcu_dereference_protected(parent_css->id, true); - depth = parent_id->depth + 1; - - child_id = get_new_cssid(child_css->ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under RCU read lock. The caller is responsible for - * pinning the returned css if it needs to be accessed outside the RCU - * critical section. + * Must be called under cgroup_mutex or RCU read lock. The caller is + * responsible for pinning the returned css if it needs to be accessed + * outside the critical section. */ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* is @dentry a cgroup dir? */ if (!dentry->d_inode || @@ -5730,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "css_from_id() needs proper protection"); + cgroup_assert_mutex_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) @@ -5780,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, return count; } -static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *seq) +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; @@ -5807,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_css_links_read(struct seq_file *seq, void *v) { + struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; read_lock(&css_set_lock); @@ -5855,12 +5403,12 @@ static struct cftype debug_files[] = { { .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, + .seq_show = current_css_set_cg_links_read, }, { .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, + .seq_show = cgroup_css_links_read, }, { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f0ff64d0ebaa..6c3154e477f6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -301,10 +301,9 @@ out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int freezer_read(struct seq_file *m, void *v) { - struct cgroup_subsys_state *pos; + struct cgroup_subsys_state *css = seq_css(m), *pos; rcu_read_lock(); @@ -458,7 +457,7 @@ static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = freezer_read, + .seq_show = freezer_read, .write_string = freezer_write, }, { diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 859c8dfd78a1..6cb20d2e7ee0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -53,10 +53,10 @@ void context_tracking_user_enter(void) /* * Repeat the user_enter() check here because some archs may be calling * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the static key - * check. + * go further. Repeat the check here until they support the inline static + * key check. */ - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return; /* @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -void __sched notrace preempt_schedule_context(void) +asmlinkage void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; @@ -160,7 +160,7 @@ void context_tracking_user_exit(void) { unsigned long flags; - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return; if (in_interrupt()) diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a6..deff2e693766 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -306,8 +306,28 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } + + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so explicitly call both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ +#ifdef CONFIG_PREEMPT + synchronize_sched(); +#endif + synchronize_rcu(); + smpboot_park_threads(cpu); + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ @@ -420,11 +440,6 @@ int cpu_up(unsigned int cpu) { int err = 0; -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; - pg_data_t *pgdat; -#endif - if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -435,27 +450,9 @@ int cpu_up(unsigned int cpu) return -EINVAL; } -#ifdef CONFIG_MEMORY_HOTPLUG - nid = cpu_to_node(cpu); - if (!node_online(nid)) { - err = mem_online_node(nid); - if (err) - return err; - } - - pgdat = NODE_DATA(nid); - if (!pgdat) { - printk(KERN_ERR - "Can't online cpu %d due to NULL pgdat\n", cpu); - return -ENOMEM; - } - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } -#endif + err = try_online_node(cpu_to_node(cpu)); + if (err) + return err; cpu_maps_update_begin(); diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb5..277f494c2a9a 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!need_resched()) + while (!tif_need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -92,8 +92,7 @@ static void cpu_idle_loop(void) if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { - current_clr_polling(); - if (!need_resched()) { + if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); @@ -103,10 +102,20 @@ static void cpu_idle_loop(void) } else { local_irq_enable(); } - current_set_polling(); + __current_set_polling(); } arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } @@ -129,7 +138,7 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - current_set_polling(); + __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6bf981e13c43..4410ac6a55f1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) + if (need_loop) { + local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); + } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; - if (need_loop) + if (need_loop) { write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); + } task_unlock(tsk); } @@ -1727,66 +1731,41 @@ out_unlock: * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. */ - -static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static int cpuset_common_seq_show(struct seq_file *sf, void *v) { - size_t count; - - mutex_lock(&callback_mutex); - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); - mutex_unlock(&callback_mutex); + struct cpuset *cs = css_cs(seq_css(sf)); + cpuset_filetype_t type = seq_cft(sf)->private; + ssize_t count; + char *buf, *s; + int ret = 0; - return count; -} - -static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - size_t count; + count = seq_get_buf(sf, &buf); + s = buf; mutex_lock(&callback_mutex); - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - s = page; switch (type) { case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); + s += cpulist_scnprintf(s, count, cs->cpus_allowed); break; case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); + s += nodelist_scnprintf(s, count, cs->mems_allowed); break; default: - retval = -EINVAL; - goto out; + ret = -EINVAL; + goto out_unlock; } - *s++ = '\n'; - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; + if (s < buf + count - 1) { + *s++ = '\n'; + seq_commit(sf, s - buf); + } else { + seq_commit(sf, -1); + } +out_unlock: + mutex_unlock(&callback_mutex); + return ret; } static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1843,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static struct cftype files[] = { { .name = "cpus", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, @@ -1851,7 +1830,7 @@ static struct cftype files[] = { { .name = "mems", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0506d447aed2..7d2f35e5df2f 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -575,8 +575,12 @@ return_normal: raw_spin_lock(&dbg_slave_lock); #ifdef CONFIG_SMP + /* If send_ready set, slaves are already waiting */ + if (ks->send_ready) + atomic_set(ks->send_ready, 1); + /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) + else if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (arch_kgdb_ops.enable_nmi) arch_kgdb_ops.enable_nmi(0); + memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; ks->signo = signo; ks->err_code = ecode; - ks->kgdb_usethreadid = 0; ks->linux_regs = regs; if (kgdb_reenter_check(ks)) @@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } +int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) +{ +#ifdef CONFIG_SMP + if (!kgdb_io_ready(0) || !send_ready) + return 1; + + if (kgdb_info[cpu].enter_kgdb == 0) { + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->ex_vector = trapnr; + ks->signo = SIGTRAP; + ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; + ks->linux_regs = regs; + ks->send_ready = send_ready; + kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); + return 0; + } +#endif + return 1; +} + static void kgdb_console_write(struct console *co, const char *s, unsigned count) { diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 2235967e78b0..572aa4f5677c 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -26,6 +26,7 @@ struct kgdb_state { unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; + atomic_t *send_ready; }; /* Exception state values */ @@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); +#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { return DBG_PASS_EVENT; } +#define KGDB_KDB_REASON_SYSTEM_NMI 0 #endif /* CONFIG_KGDB_KDB */ #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 328d18ef31e4..8859ca34dcfe 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; - if (in_nmi()) + if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP) + reason = KDB_REASON_SYSTEM_NMI; + + else if (in_nmi()) reason = KDB_REASON_NMI; for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 00eb8f7fbf41..0b097c8a1e50 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, instruction_pointer(regs)); kdb_dumpregs(regs); break; + case KDB_REASON_SYSTEM_NMI: + kdb_printf("due to System NonMaskable Interrupt\n"); + break; case KDB_REASON_NMI: kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", diff --git a/kernel/delayacct.c b/kernel/delayacct.c index d473988c1d0b..54996b71e66d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; - /* Though tsk->delays accessed later, early exit avoids - * unnecessary returning of other data - */ - if (!tsk->delays) - goto done; - tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); cputime_to_timespec(utime + stime, &ts); @@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_count += tsk->delays->freepages_count; spin_unlock_irqrestore(&tsk->delays->lock, flags); -done: return 0; } diff --git a/kernel/elfcore.c b/kernel/elfcore.c index ff915efef66d..e556751d15d9 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -1,23 +1,19 @@ #include <linux/elf.h> #include <linux/fs.h> #include <linux/mm.h> - -#include <asm/elf.h> - +#include <linux/binfmts.h> Elf_Half __weak elf_core_extra_phdrs(void) { return 0; } -int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, - unsigned long limit) +int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) { return 1; } -int __weak elf_core_write_extra_data(struct file *file, size_t *size, - unsigned long limit) +int __weak elf_core_write_extra_data(struct coredump_params *cprm) { return 1; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 953c14348375..56003c6edfd3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ - PERF_FLAG_PID_CGROUP) + PERF_FLAG_PID_CGROUP |\ + PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks @@ -175,8 +176,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; -static atomic_t perf_sample_allowed_ns __read_mostly = - ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); +static int perf_sample_allowed_ns __read_mostly = + DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; void update_perf_cpu_limits(void) { @@ -184,7 +185,7 @@ void update_perf_cpu_limits(void) tmp *= sysctl_perf_cpu_time_max_percent; do_div(tmp, 100); - atomic_set(&perf_sample_allowed_ns, tmp); + ACCESS_ONCE(perf_sample_allowed_ns) = tmp; } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -193,7 +194,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -228,14 +229,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 -DEFINE_PER_CPU(u64, running_sample_length); +static DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; u64 local_samples_len; + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - if (atomic_read(&perf_sample_allowed_ns) == 0) + if (allowed_ns == 0) return; /* decay the counter by 1 average sample */ @@ -251,7 +253,7 @@ void perf_sample_event_took(u64 sample_len_ns) */ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + if (avg_local_sample_len <= allowed_ns) return; if (max_samples_per_tick <= 1) @@ -262,10 +264,9 @@ void perf_sample_event_took(u64 sample_len_ns) perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %d), lowering " + "perf samples too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, - atomic_read(&perf_sample_allowed_ns), + avg_local_sample_len, allowed_ns, sysctl_perf_event_sample_rate); update_perf_cpu_limits(); @@ -899,6 +900,7 @@ static void unclone_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); ctx->parent_ctx = NULL; } + ctx->generation++; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,6 +1138,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; + + ctx->generation++; } /* @@ -1201,6 +1205,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + size += sizeof(data->txn); + event->header_size = size; } @@ -1310,6 +1317,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + + ctx->generation++; } static void perf_group_detach(struct perf_event *event) @@ -1388,6 +1397,8 @@ event_sched_out(struct perf_event *event, if (event->state != PERF_EVENT_STATE_ACTIVE) return; + perf_pmu_disable(event->pmu); + event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; @@ -1404,6 +1415,8 @@ event_sched_out(struct perf_event *event, ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + + perf_pmu_enable(event->pmu); } static void @@ -1644,6 +1657,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { u64 tstamp = perf_event_time(event); + int ret = 0; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -1666,10 +1680,13 @@ event_sched_in(struct perf_event *event, */ smp_wmb(); + perf_pmu_disable(event->pmu); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; - return -EAGAIN; + ret = -EAGAIN; + goto out; } event->tstamp_running += tstamp - event->tstamp_stopped; @@ -1685,7 +1702,10 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - return 0; +out: + perf_pmu_enable(event->pmu); + + return ret; } static int @@ -2146,22 +2166,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, } /* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. + * Test whether two contexts are equivalent, i.e. whether they have both been + * cloned from the same version of the same context. + * + * Equivalence is measured using a generation number in the context that is + * incremented on each modification to it; see unclone_ctx(), list_add_event() + * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; + /* Pinning disables the swap optimization */ + if (ctx1->pin_count || ctx2->pin_count) + return 0; + + /* If ctx1 is the parent of ctx2 */ + if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) + return 1; + + /* If ctx2 is the parent of ctx1 */ + if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) + return 1; + + /* + * If ctx1 and ctx2 have the same parent; we flatten the parent + * hierarchy, see perf_event_init_context(). + */ + if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && + ctx1->parent_gen == ctx2->parent_gen) + return 1; + + /* Unmatched */ + return 0; } static void __perf_event_sync_stat(struct perf_event *event, @@ -2210,9 +2246,6 @@ static void __perf_event_sync_stat(struct perf_event *event, perf_event_update_userpage(next_event); } -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { @@ -2244,7 +2277,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; - struct perf_event_context *parent; + struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; @@ -2256,10 +2289,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, return; rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { + if (!next_ctx) + goto unlock; + + parent = rcu_dereference(ctx->parent_ctx); + next_parent = rcu_dereference(next_ctx->parent_ctx); + + /* If neither context have a parent context; they cannot be clones. */ + if (!parent && !next_parent) + goto unlock; + + if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both @@ -2287,6 +2328,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } +unlock: rcu_read_unlock(); if (do_switch) { @@ -2713,6 +2755,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + perf_pmu_disable(event->pmu); + hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { @@ -2722,7 +2766,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, } if (!event->attr.freq || !event->attr.sample_freq) - continue; + goto next; /* * stop the event and update event->count @@ -2744,6 +2788,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); + next: + perf_pmu_enable(event->pmu); } perf_pmu_enable(ctx->pmu); @@ -3497,7 +3543,7 @@ static void perf_event_for_each(struct perf_event *event, static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct perf_event_context *ctx = event->ctx; - int ret = 0; + int ret = 0, active; u64 value; if (!is_sampling_event(event)) @@ -3521,6 +3567,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->attr.sample_period = value; event->hw.sample_period = value; } + + active = (event->state == PERF_EVENT_STATE_ACTIVE); + if (active) { + perf_pmu_disable(ctx->pmu); + event->pmu->stop(event, PERF_EF_UPDATE); + } + + local64_set(&event->hw.period_left, 0); + + if (active) { + event->pmu->start(event, PERF_EF_RELOAD); + perf_pmu_enable(ctx->pmu); + } + unlock: raw_spin_unlock_irq(&ctx->lock); @@ -4572,6 +4632,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + perf_output_put(handle, data->txn); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5100,27 +5163,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); + char *name; if (file) { struct inode *inode; dev_t dev; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = "//enomem"; + goto cpy_name; + } /* - * d_path works from the end of the rb backwards, so we + * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); + name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; + name = "//toolong"; + goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; @@ -5128,34 +5190,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); - + goto got_name; } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp) - 1); - tmp[sizeof(tmp) - 1] = '\0'; - goto got_name; - } + name = (char *)arch_vma_name(vma); + if (name) + goto cpy_name; - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && + if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { - name = strncpy(tmp, "[heap]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && + name = "[heap]"; + goto cpy_name; + } + if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { - name = strncpy(tmp, "[stack]", sizeof(tmp)); - goto got_name; + name = "[stack]"; + goto cpy_name; } - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; + name = "//anon"; + goto cpy_name; } +cpy_name: + strlcpy(tmp, name, sizeof(tmp)); + name = tmp; got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. + */ + size = strlen(name)+1; + while (!IS_ALIGNED(size, sizeof(u64))) + name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; @@ -5643,11 +5710,6 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5681,9 +5743,6 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); @@ -6292,6 +6351,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, @@ -6336,17 +6396,19 @@ perf_event_mux_interval_ms_store(struct device *dev, return count; } +static DEVICE_ATTR_RW(perf_event_mux_interval_ms); -static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_RW(perf_event_mux_interval_ms), - __ATTR_NULL, +static struct attribute *pmu_dev_attrs[] = { + &dev_attr_type.attr, + &dev_attr_perf_event_mux_interval_ms.attr, + NULL, }; +ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", - .dev_attrs = pmu_dev_attrs, + .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) @@ -6623,6 +6685,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); + INIT_LIST_HEAD(&event->active_entry); + INIT_HLIST_NODE(&event->hlist_entry); + init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -6933,6 +6998,7 @@ SYSCALL_DEFINE5(perf_event_open, int event_fd; int move_group = 0; int err; + int f_flags = O_RDWR; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) @@ -6961,7 +7027,10 @@ SYSCALL_DEFINE5(perf_event_open, if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; - event_fd = get_unused_fd(); + if (flags & PERF_FLAG_FD_CLOEXEC) + f_flags |= O_CLOEXEC; + + event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; @@ -7083,7 +7152,8 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, + f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); goto err_context; @@ -7126,7 +7196,6 @@ SYSCALL_DEFINE5(perf_event_open, } perf_install_in_context(ctx, event, event->cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -7209,7 +7278,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca6599723be5..569b218782ad 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) } #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned int \ +static inline unsigned long \ func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned int len) \ + const void *buf, unsigned long len) \ { \ unsigned long size, written; \ \ do { \ - size = min_t(unsigned long, handle->size, len); \ - \ + size = min(handle->size, len); \ written = memcpy_func(handle->addr, buf, size); \ + written = size - written; \ \ len -= written; \ handle->addr += written; \ @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ return len; \ } -static inline int memcpy_common(void *dst, const void *src, size_t n) +static inline unsigned long +memcpy_common(void *dst, const void *src, unsigned long n) { memcpy(dst, src, n); - return n; + return 0; } DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) -#define MEMCPY_SKIP(dst, src, n) (n) +static inline unsigned long +memcpy_skip(void *dst, const void *src, unsigned long n) +{ + return 0; +} -DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) +DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) #ifndef arch_perf_out_copy_user -#define arch_perf_out_copy_user __copy_from_user_inatomic +#define arch_perf_out_copy_user arch_perf_out_copy_user + +static inline unsigned long +arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) +{ + unsigned long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, n); + pagefault_enable(); + + return ret; +} #endif DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 9c2ddfbf4525..146a5792b1d2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -12,40 +12,10 @@ #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/circ_buf.h> #include "internal.h" -static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long sz = perf_data_size(rb); - unsigned long mask = sz - 1; - - /* - * check if user-writable - * overwrite : over-write its own tail - * !overwrite: buffer possibly drops events. - */ - if (rb->overwrite) - return true; - - /* - * verify that payload is not bigger than buffer - * otherwise masking logic may fail to detect - * the "not enough space" condition - */ - if ((head - offset) > sz) - return false; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLL_IN); @@ -91,19 +61,20 @@ again: * * kernel user * - * READ ->data_tail READ ->data_head - * smp_mb() (A) smp_rmb() (C) - * WRITE $data READ $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->data_head WRITE ->data_tail + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } * * Where A pairs with D, and B pairs with C. * - * I don't think A needs to be a full barrier because we won't in fact - * write data until we see the store from userspace. So we simply don't - * issue the data WRITE until we observe it. Be conservative for now. + * In our case (A) is a control dependency that separates the load of + * the ->data_tail and the stores of $data. In case ->data_tail + * indicates there is no room in the buffer to store $data we do not. * - * OTOH, D needs to be a full barrier since it separates the data READ + * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C @@ -111,12 +82,12 @@ again: * * See perf_output_begin(). */ - smp_wmb(); + smp_wmb(); /* B, matches C */ rb->user_page->data_head = head; /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read rb->head. + * Now check if we missed an update -- rely on previous implied + * compiler barriers to force a re-read. */ if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); @@ -135,8 +106,7 @@ int perf_output_begin(struct perf_output_handle *handle, { struct ring_buffer *rb; unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; + int have_lost, page_shift; struct { struct perf_event_header header; u64 id; @@ -151,57 +121,72 @@ int perf_output_begin(struct perf_output_handle *handle, event = event->parent; rb = rcu_dereference(event->rb); - if (!rb) + if (unlikely(!rb)) goto out; - handle->rb = rb; - handle->event = event; - - if (!rb->nr_pages) + if (unlikely(!rb->nr_pages)) goto out; + handle->rb = rb; + handle->event = event; + have_lost = local_read(&rb->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; + if (unlikely(have_lost)) { + size += sizeof(lost_event); + if (event->attr.sample_id_all) + size += event->id_header_size; } perf_output_get_handle(handle); do { + tail = ACCESS_ONCE(rb->user_page->data_tail); + offset = head = local_read(&rb->head); + if (!rb->overwrite && + unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) + goto fail; + /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. + * The above forms a control dependency barrier separating the + * @tail load above from the data stores below. Since the @tail + * load is required to compute the branch to fail below. + * + * A, matches D; the full memory barrier userspace SHOULD issue + * after reading the data and before storing the new tail + * position. * * See perf_output_put_handle(). */ - tail = ACCESS_ONCE(rb->user_page->data_tail); - smp_mb(); - offset = head = local_read(&rb->head); + head += size; - if (unlikely(!perf_output_space(rb, tail, offset, head))) - goto fail; } while (local_cmpxchg(&rb->head, offset, head) != offset); - if (head - local_read(&rb->wakeup) > rb->watermark) + /* + * We rely on the implied barrier() by local_cmpxchg() to ensure + * none of the data stores below can be lifted up by the compiler. + */ + + if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(rb)); - handle->page &= rb->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); - handle->addr = rb->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; + page_shift = PAGE_SHIFT + page_order(rb); - if (have_lost) { + handle->page = (offset >> page_shift) & (rb->nr_pages - 1); + offset &= (1UL << page_shift) - 1; + handle->addr = rb->data_pages[handle->page] + offset; + handle->size = (1UL << page_shift) - offset; + + if (unlikely(have_lost)) { + struct perf_sample_data sample_data; + + lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); + perf_event_header__init_id(&lost_event.header, + &sample_data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, &sample_data); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bdca70e..307d87c0991a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -35,6 +35,7 @@ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> +#include <linux/task_work.h> #include <linux/uprobes.h> @@ -72,6 +73,17 @@ struct uprobe { struct inode *inode; /* Also hold a ref to inode */ loff_t offset; unsigned long flags; + + /* + * The generic code assumes that it has two members of unknown type + * owned by the arch-specific code: + * + * insn - copy_insn() saves the original instruction here for + * arch_uprobe_analyze_insn(). + * + * ixol - potentially modified instruction to execute out of + * line, copied to xol_area by xol_get_insn_slot(). + */ struct arch_uprobe arch; }; @@ -85,6 +97,29 @@ struct return_instance { }; /* + * Execute out of line area: anonymous executable mapping installed + * by the probed task to execute the copy of the original instruction + * mangled by set_swbp(). + * + * On a breakpoint hit, thread contests for a slot. It frees the + * slot after singlestep. Currently a fixed number of slots are + * allocated. + */ +struct xol_area { + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct page *page; + + /* + * We keep the vma's vm_start rather than a pointer to the vma + * itself. The probed process or a naughty kernel module could make + * the vma go away, and we must handle that reasonably gracefully. + */ + unsigned long vaddr; /* Page(s) of instruction slots */ +}; + +/* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. @@ -244,12 +279,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and - * write_opcode accordingly. This would never be a problem for archs that - * have fixed length instructions. + * uprobe_write_opcode accordingly. This would never be a problem for archs + * that have fixed length instructions. */ /* - * write_opcode - write the opcode at a given virtual address. + * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -260,7 +295,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. */ -static int write_opcode(struct mm_struct *mm, unsigned long vaddr, +int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; @@ -314,7 +349,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -329,7 +364,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -503,9 +538,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) return ret; } -static int -__copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, loff_t offset) +static int __copy_insn(struct address_space *mapping, struct file *filp, + void *insn, int nbytes, loff_t offset) { struct page *page; @@ -527,28 +561,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, static int copy_insn(struct uprobe *uprobe, struct file *filp) { - struct address_space *mapping; - unsigned long nbytes; - int bytes; + struct address_space *mapping = uprobe->inode->i_mapping; + loff_t offs = uprobe->offset; + void *insn = &uprobe->arch.insn; + int size = sizeof(uprobe->arch.insn); + int len, err = -EIO; - nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); - mapping = uprobe->inode->i_mapping; - - /* Instruction at end of binary; copy only available bytes */ - if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) - bytes = uprobe->inode->i_size - uprobe->offset; - else - bytes = MAX_UINSN_BYTES; + /* Copy only available bytes, -EIO if nothing was read */ + do { + if (offs >= i_size_read(uprobe->inode)) + break; - /* Instruction at the page-boundary; copy bytes in second page */ - if (nbytes < bytes) { - int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes); + len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); + err = __copy_insn(mapping, filp, insn, len, offs); if (err) - return err; - bytes = nbytes; - } - return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); + break; + + insn += len; + offs += len; + size -= len; + } while (size); + + return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, @@ -569,14 +603,14 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; - /* write_opcode() assumes we don't cross page boundary */ + /* uprobe_write_opcode() assumes we don't cross page boundary */ BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); @@ -1096,21 +1130,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon } /* Slot allocation for XOL */ -static int xol_add_vma(struct xol_area *area) +static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - struct mm_struct *mm = current->mm; int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; - ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { - ret = area->vaddr; - goto fail; + if (!area->vaddr) { + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, + PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; + } } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, @@ -1120,30 +1155,19 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; - ret = 0; fail: up_write(&mm->mmap_sem); return ret; } -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) +static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; - struct xol_area *area; uprobe_opcode_t insn = UPROBE_SWBP_INSN; + struct xol_area *area; - area = mm->uprobes_state.xol_area; - if (area) - goto ret; - - area = kzalloc(sizeof(*area), GFP_KERNEL); + area = kmalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1155,13 +1179,14 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; - /* allocate first slot of task's xol_area for the return probes */ + area->vaddr = vaddr; + init_waitqueue_head(&area->wq); + /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); atomic_set(&area->slot_count, 1); - init_waitqueue_head(&area->wq); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); - if (!xol_add_vma(area)) + if (!xol_add_vma(mm, area)) return area; __free_page(area->page); @@ -1170,9 +1195,25 @@ static struct xol_area *get_xol_area(void) free_area: kfree(area); out: + return NULL; +} + +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *get_xol_area(void) +{ + struct mm_struct *mm = current->mm; + struct xol_area *area; + + if (!mm->uprobes_state.xol_area) + __create_xol_area(0); + area = mm->uprobes_state.xol_area; - ret: - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ return area; } @@ -1256,7 +1297,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); + copy_to_page(area->page, xol_vaddr, + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. @@ -1345,14 +1387,6 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) -{ - t->utask = NULL; -} - -/* * Allocate a uprobe_task object for the task if if necessary. * Called when the thread hits a breakpoint. * @@ -1367,6 +1401,82 @@ static struct uprobe_task *get_utask(void) return current->utask; } +static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) +{ + struct uprobe_task *n_utask; + struct return_instance **p, *o, *n; + + n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + if (!n_utask) + return -ENOMEM; + t->utask = n_utask; + + p = &n_utask->return_instances; + for (o = o_utask->return_instances; o; o = o->next) { + n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!n) + return -ENOMEM; + + *n = *o; + atomic_inc(&n->uprobe->ref); + n->next = NULL; + + *p = n; + p = &n->next; + n_utask->depth++; + } + + return 0; +} + +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", + current->comm, current->pid, msg); +} + +static void dup_xol_work(struct callback_head *work) +{ + if (current->flags & PF_EXITING) + return; + + if (!__create_xol_area(current->utask->dup_xol_addr)) + uprobe_warn(current, "dup xol area"); +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t, unsigned long flags) +{ + struct uprobe_task *utask = current->utask; + struct mm_struct *mm = current->mm; + struct xol_area *area; + + t->utask = NULL; + + if (!utask || !utask->return_instances) + return; + + if (mm == t->mm && !(flags & CLONE_VFORK)) + return; + + if (dup_utask(t, utask)) + return uprobe_warn(t, "dup ret instances"); + + /* The task can fork() after dup_xol_work() fails */ + area = mm->uprobes_state.xol_area; + if (!area) + return uprobe_warn(t, "dup xol area"); + + if (mm == t->mm) + return; + + t->utask->dup_xol_addr = area->vaddr; + init_task_work(&t->utask->dup_xol_work, dup_xol_work); + task_work_add(t, &t->utask->dup_xol_work, true); +} + /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. @@ -1744,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* Tracing handlers use ->utask to communicate with fetch methods */ + if (!get_utask()) + goto out; + handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; @@ -1857,9 +1971,4 @@ static int __init init_uprobes(void) return register_die_notifier(&uprobe_exception_nb); } -module_init(init_uprobes); - -static void __exit exit_uprobes(void) -{ -} -module_exit(exit_uprobes); +__initcall(init_uprobes); diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..1e77fc645317 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* diff --git a/kernel/extable.c b/kernel/extable.c index 832cb28105bb..763faf037ec1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) static inline int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + addr < (unsigned long)_einittext) return 1; return 0; } @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && - addr <= (unsigned long)_etext) + addr < (unsigned long)_etext) return 1; if (system_state == SYSTEM_BOOTING && diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6bd..2f11bbe376b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,11 +532,12 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); mm_init_owner(mm, p); + clear_tlb_flush_pending(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -560,7 +561,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,12 +815,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif -#ifdef CONFIG_NUMA_BALANCING - mm->first_nid = NUMA_PTE_SCAN_INIT; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1037,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1089,8 +1092,10 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters); + p->pi_waiters = RB_ROOT; + p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; + p->pi_top_task = NULL; #endif } @@ -1174,7 +1179,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * do not allow it to share a thread group or signal handlers or * parent with the forking task. */ - if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if (clone_flags & CLONE_SIGHAND) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) @@ -1313,7 +1318,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p); + retval = sched_fork(clone_flags, p); + if (retval) + goto bad_fork_cleanup_policy; retval = perf_event_init_task(p); if (retval) @@ -1373,7 +1380,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif - uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ @@ -1406,13 +1412,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->tgid = p->pid; } - p->pdeath_signal = 0; - p->exit_state = 0; - p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; + p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; @@ -1475,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; @@ -1490,6 +1496,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, perf_event_fork(p); trace_task_newtask(p, clone_flags); + uprobe_copy_process(p, clone_flags); return p; diff --git a/kernel/freezer.c b/kernel/freezer.c index b462fa197517..aa6a8aadb911 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt); bool pm_freezing; bool pm_nosig_freezing; +/* + * Temporary export for the deadlock workaround in ata_scsi_hotplug(). + * Remove once the hack becomes unnecessary. + */ +EXPORT_SYMBOL_GPL(pm_freezing); + /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); diff --git a/kernel/futex.c b/kernel/futex.c index c3a1a55a5214..44a1261cb9ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -63,14 +63,101 @@ #include <linux/sched/rt.h> #include <linux/hugetlb.h> #include <linux/freezer.h> +#include <linux/bootmem.h> #include <asm/futex.h> -#include "rtmutex_common.h" +#include "locking/rtmutex_common.h" -int __read_mostly futex_cmpxchg_enabled; +/* + * Basic futex operation and ordering guarantees: + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. + * + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * if (queue_empty()) + * return; + * if (uval == val) + * lock(hash_bucket(futex)); + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * + * waiters++; + * mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `-------> mb(); (B) + * if (uval == val) + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); if (waiters) + * lock(hash_bucket(futex)); + * wake_waiters(futex); + * unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read -- this + * is guaranteed by the head counter in the hb spinlock; and where (B) + * orders the write to futex and the waiters read -- this is done by the + * barriers in get_futex_key_refs(), through either ihold or atomic_inc, + * depending on the futex type. + * + * This yields the following case (where X:=waiters, Y:=futex): + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. + */ -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) +int __read_mostly futex_cmpxchg_enabled; /* * Futex flags used to encode options to functions and preserve them across @@ -149,9 +236,41 @@ static const struct futex_q futex_q_init = { struct futex_hash_bucket { spinlock_t lock; struct plist_head chain; -}; +} ____cacheline_aligned_in_smp; + +static unsigned long __read_mostly futex_hashsize; + +static struct futex_hash_bucket *futex_queues; + +static inline void futex_get_mm(union futex_key *key) +{ + atomic_inc(&key->private.mm->mm_count); + /* + * Ensure futex_get_mm() implies a full barrier such that + * get_futex_key() implies a full barrier. This is relied upon + * as full barrier (B), see the ordering comment above. + */ + smp_mb__after_atomic_inc(); +} + +static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + /* + * Tasks trying to enter the critical region are most likely + * potential waiters that will be added to the plist. Ensure + * that wakers won't miss to-be-slept tasks in the window between + * the wait call and the actual plist_add. + */ + if (spin_is_locked(&hb->lock)) + return true; + smp_rmb(); /* Make sure we check the lock state first */ -static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; + return !plist_head_empty(&hb->chain); +#else + return true; +#endif +} /* * We hash on the keys returned from get_futex_key (see below). @@ -161,7 +280,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) u32 hash = jhash2((u32*)&key->both.word, (sizeof(key->both.word)+sizeof(key->both.ptr))/4, key->both.offset); - return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; + return &futex_queues[hash & (futex_hashsize - 1)]; } /* @@ -187,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); + ihold(key->shared.inode); /* implies MB (B) */ break; case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); + futex_get_mm(key); /* implies MB (B) */ break; } } @@ -251,6 +370,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) return -EINVAL; address -= key->both.offset; + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -259,11 +381,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ return 0; } @@ -288,7 +408,7 @@ again: put_page(page); /* serialize against __split_huge_page_splitting() */ local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { page_head = compound_head(page); /* * page_head is valid pointer but we must pin @@ -370,7 +490,7 @@ again: key->shared.pgoff = basepage_index(page); } - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ out: unlock_page(page_head); @@ -597,13 +717,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; - struct plist_head *head; struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up @@ -985,7 +1102,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; int ret; @@ -997,10 +1113,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) goto out; hb = hash_futex(&key); + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) + goto out_put_key; + spin_lock(&hb->lock); - head = &hb->chain; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1018,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); +out_put_key: put_futex_key(&key); out: return ret; @@ -1033,7 +1154,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head; struct futex_q *this, *next; int ret, op_ret; @@ -1081,9 +1201,7 @@ retry_private: goto retry; } - head = &hb1->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (match_futex (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1096,10 +1214,8 @@ retry_private: } if (op_ret > 0) { - head = &hb2->chain; - op_ret = 0; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (match_futex (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1269,7 +1385,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, int drop_count = 0, task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head1; struct futex_q *this, *next; u32 curval2; @@ -1392,8 +1507,7 @@ retry_private: } } - head1 = &hb1->chain; - plist_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (task_count - nr_wake >= nr_requeue) break; @@ -1488,12 +1602,12 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); + spin_lock(&hb->lock); /* implies MB (A) */ return hb; } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); @@ -1866,7 +1980,7 @@ retry_private: ret = get_futex_value_locked(&uval, uaddr); if (ret) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = get_user(uval, uaddr); if (ret) @@ -1880,7 +1994,7 @@ retry_private: } if (uval != val) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = -EWOULDBLOCK; } @@ -2028,7 +2142,7 @@ retry_private: * Task is exiting and we just wait for the * exit to complete. */ - queue_unlock(&q, hb); + queue_unlock(hb); put_futex_key(&q.key); cond_resched(); goto retry; @@ -2080,7 +2194,7 @@ retry_private: goto out_put_key; out_unlock_put_key: - queue_unlock(&q, hb); + queue_unlock(hb); out_put_key: put_futex_key(&q.key); @@ -2090,7 +2204,7 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - queue_unlock(&q, hb); + queue_unlock(hb); ret = fault_in_user_writeable(uaddr); if (ret) @@ -2112,7 +2226,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; u32 uval, vpid = task_pid_vnr(current); int ret; @@ -2152,9 +2265,7 @@ retry: * Ok, other tasks may need to be woken up - check waiters * and do the wakeup if necessary: */ - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); @@ -2315,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * code while we sleep on uaddr. */ debug_rt_mutex_init_waiter(&rt_waiter); + RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); + RB_CLEAR_NODE(&rt_waiter.tree_entry); rt_waiter.task = NULL; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); @@ -2733,8 +2846,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, static int __init futex_init(void) { u32 curval; - int i; + unsigned int futex_shift; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; /* * This will fail and we want it. Some arch implementations do * runtime detection of the futex_atomic_cmpxchg_inatomic() @@ -2748,7 +2874,7 @@ static int __init futex_init(void) if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + for (i = 0; i < futex_hashsize; i++) { plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d4da55d1fb65..d04ce8ac4399 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. +choice + prompt "Specify GCOV format" + depends on GCOV_KERNEL + default GCOV_FORMAT_AUTODETECT + ---help--- + The gcov format is usually determined by the GCC version, but there are + exceptions where format changes are integrated in lower-version GCCs. + In such a case use this option to adjust the format used in the kernel + accordingly. + + If unsure, choose "Autodetect". + +config GCOV_FORMAT_AUTODETECT + bool "Autodetect" + ---help--- + Select this option to use the format that corresponds to your GCC + version. + +config GCOV_FORMAT_3_4 + bool "GCC 3.4 format" + ---help--- + Select this option to use the format defined by GCC 3.4. + +config GCOV_FORMAT_4_7 + bool "GCC 4.7 format" + ---help--- + Select this option to use the format defined by GCC 4.7. + +endchoice + endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index e97ca59e2520..52aa7e8de927 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,33 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o +# if-lt +# Usage VAR := $(call if-lt, $(a), $(b)) +# Returns 1 if (a < b) +if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) + +ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) + cc-ver := 0304 +else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) + cc-ver := 0407 +else +# Use cc-version if available, otherwise set 0 +# +# scripts/Kbuild.include, which contains cc-version function, is not included +# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" +# Meaning cc-ver is empty causing if-lt test to fail with +# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. +# This has no affect on the clean phase, but the error message could be +# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version +# is not available. We can probably move if-lt to Kbuild.include, so it's also +# not defined during clean or to include Kbuild.include in +# scripts/Makefile.clean. But the following workaround seems least invasive. + cc-ver := $(if $(call cc-version),$(call cc-version),0) +endif + +obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o + +ifeq ($(call if-lt, $(cc-ver), 0407),1) + obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o +else + obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o +endif diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9b22d03cc581..f45b75b713c0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -20,7 +20,6 @@ #include <linux/mutex.h> #include "gcov.h" -static struct gcov_info *gcov_info_head; static int gcov_events_enabled; static DEFINE_MUTEX(gcov_lock); @@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info) mutex_lock(&gcov_lock); if (gcov_version == 0) { - gcov_version = info->version; + gcov_version = gcov_info_version(info); /* * Printing gcc's version magic may prove useful for debugging * incompatibility reports. @@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info) * Add new profiling data structure to list and inform event * listener. */ - info->next = gcov_info_head; - gcov_info_head = info; + gcov_info_link(info); if (gcov_events_enabled) gcov_event(GCOV_ADD, info); mutex_unlock(&gcov_lock); @@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_delta); +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + /** * gcov_enable_events - enable event reporting through gcov_event() * @@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta); */ void gcov_enable_events(void) { - struct gcov_info *info; + struct gcov_info *info = NULL; mutex_lock(&gcov_lock); gcov_events_enabled = 1; + /* Perform event callback for previously registered entries. */ - for (info = gcov_info_head; info; info = info->next) + while ((info = gcov_info_next(info))) gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); } @@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) { struct module *mod = data; - struct gcov_info *info; - struct gcov_info *prev; + struct gcov_info *info = NULL; + struct gcov_info *prev = NULL; if (event != MODULE_STATE_GOING) return NOTIFY_OK; mutex_lock(&gcov_lock); - prev = NULL; + /* Remove entries located in module from linked list. */ - for (info = gcov_info_head; info; info = info->next) { + while ((info = gcov_info_next(info))) { if (within(info, mod->module_core, mod->core_size)) { - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; + gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); } else prev = info; } + mutex_unlock(&gcov_lock); return NOTIFY_OK; diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 7a7d2ee96d42..15ff01a76379 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str) unsigned long val; if (kstrtoul(str, 0, &val)) { - pr_warning("invalid gcov_persist parameter '%s'\n", str); + pr_warn("invalid gcov_persist parameter '%s'\n", str); return 0; } gcov_persist = val; @@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name) list_for_each_entry(node, &all_head, all) { info = get_node_info(node); - if (info && (strcmp(info->filename, name) == 0)) + if (info && (strcmp(gcov_info_filename(info), name) == 0)) return node; } @@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, seq = file->private_data; info = gcov_iter_get_info(seq->private); mutex_lock(&node_lock); - node = get_node_by_name(info->filename); + node = get_node_by_name(gcov_info_filename(info)); if (node) { /* Reset counts or remove node for unloaded modules. */ if (node->num_loaded == 0) @@ -365,7 +365,7 @@ static const char *deskew(const char *basename) */ static void add_links(struct gcov_node *node, struct dentry *parent) { - char *basename; + const char *basename; char *target; int num; int i; @@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent) if (!node->links) return; for (i = 0; i < num; i++) { - target = get_link_target(get_node_info(node)->filename, - &gcov_link[i]); + target = get_link_target( + gcov_info_filename(get_node_info(node)), + &gcov_link[i]); if (!target) goto out_err; - basename = strrchr(target, '/'); - if (!basename) + basename = kbasename(target); + if (basename == target) goto out_err; - basename++; node->links[i] = debugfs_create_symlink(deskew(basename), parent, target); if (!node->links[i]) @@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, } else node->dentry = debugfs_create_dir(node->name, parent->dentry); if (!node->dentry) { - pr_warning("could not create file\n"); + pr_warn("could not create file\n"); kfree(node); return NULL; } @@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, err_nomem: kfree(node); - pr_warning("out of memory\n"); + pr_warn("out of memory\n"); return NULL; } @@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info) struct gcov_node *parent; struct gcov_node *node; - filename = kstrdup(info->filename, GFP_KERNEL); + filename = kstrdup(gcov_info_filename(info), GFP_KERNEL); if (!filename) return; parent = &root_node; @@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) */ loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); if (!loaded_info) { - pr_warning("could not add '%s' (out of memory)\n", - info->filename); + pr_warn("could not add '%s' (out of memory)\n", + gcov_info_filename(info)); return; } memcpy(loaded_info, node->loaded_info, @@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) * data set replaces the copy of the last one. */ if (!gcov_info_is_compatible(node->unloaded_info, info)) { - pr_warning("discarding saved data for %s " - "(incompatible version)\n", info->filename); + pr_warn("discarding saved data for %s " + "(incompatible version)\n", + gcov_info_filename(info)); gcov_info_free(node->unloaded_info); node->unloaded_info = NULL; } @@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) * The initial one takes precedence. */ if (!gcov_info_is_compatible(node->loaded_info[0], info)) { - pr_warning("could not add '%s' (incompatible " - "version)\n", info->filename); + pr_warn("could not add '%s' (incompatible " + "version)\n", gcov_info_filename(info)); kfree(loaded_info); return; } @@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info) else { node->unloaded_info = gcov_info_dup(info); if (!node->unloaded_info) { - pr_warning("could not save data for '%s' " - "(out of memory)\n", info->filename); + pr_warn("could not save data for '%s' " + "(out of memory)\n", + gcov_info_filename(info)); } } } @@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info) i = get_info_index(node, info); if (i < 0) { - pr_warning("could not remove '%s' (not found)\n", - info->filename); + pr_warn("could not remove '%s' (not found)\n", + gcov_info_filename(info)); return; } if (gcov_persist) @@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) struct gcov_node *node; mutex_lock(&node_lock); - node = get_node_by_name(info->filename); + node = get_node_by_name(gcov_info_filename(info)); switch (action) { case GCOV_ADD: if (node) @@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) if (node) remove_info(node, info); else { - pr_warning("could not remove '%s' (not found)\n", - info->filename); + pr_warn("could not remove '%s' (not found)\n", + gcov_info_filename(info)); } break; } diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index ae5bb4260033..27bc88a35013 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -21,6 +21,121 @@ #include <linux/vmalloc.h> #include "gcov.h" +#define GCOV_COUNTERS 5 + +static struct gcov_info *gcov_info_head; + +/** + * struct gcov_fn_info - profiling meta data per function + * @ident: object file-unique function identifier + * @checksum: function checksum + * @n_ctrs: number of values per counter type belonging to this function + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + */ +struct gcov_fn_info { + unsigned int ident; + unsigned int checksum; + unsigned int n_ctrs[0]; +}; + +/** + * struct gcov_ctr_info - profiling data per counter type + * @num: number of counter values for this type + * @values: array of counter values for this type + * @merge: merge function for counter values of this type (unused) + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; + void (*merge)(gcov_type *, unsigned int); +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: time stamp + * @filename: name of the associated gcov data file + * @n_functions: number of instrumented functions + * @functions: function data + * @ctr_mask: mask specifying which counter types are active + * @counts: counter data per counter type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + unsigned int n_functions; + const struct gcov_fn_info *functions; + unsigned int ctr_mask; + struct gcov_ctr_info counts[0]; +}; + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return gcov_info_head; + + return info->next; +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + info->next = gcov_info_head; + gcov_info_head = info; +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c new file mode 100644 index 000000000000..2c6e4631c814 --- /dev/null +++ b/kernel/gcov/gcc_4_7.c @@ -0,0 +1,560 @@ +/* + * This code provides functions to handle gcc's profiling data format + * introduced with gcc 4.7. + * + * This file is based heavily on gcc_3_4.c file. + * + * For a better understanding, refer to gcc source: + * gcc/gcov-io.h + * libgcc/libgcov.c + * + * Uses gcc-internal data definitions. + */ + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +#define GCOV_COUNTERS 8 +#define GCOV_TAG_FUNCTION_LENGTH 3 + +static struct gcov_info *gcov_info_head; + +/** + * struct gcov_ctr_info - information about counters for a single function + * @num: number of counter values for this type + * @values: array of counter values for this type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; +}; + +/** + * struct gcov_fn_info - profiling meta data per function + * @key: comdat key + * @ident: unique ident of function + * @lineno_checksum: function lineo_checksum + * @cfg_checksum: function cfg checksum + * @ctrs: instrumented counters + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + * + * Information about a single function. This uses the trailing array + * idiom. The number of counters is determined from the merge pointer + * array in gcov_info. The key is used to detect which of a set of + * comdat functions was selected -- it points to the gcov_info object + * of the object file containing the selected comdat function. + */ +struct gcov_fn_info { + const struct gcov_info *key; + unsigned int ident; + unsigned int lineno_checksum; + unsigned int cfg_checksum; + struct gcov_ctr_info ctrs[0]; +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: uniquifying time stamp + * @filename: name of the associated gcov data file + * @merge: merge functions (null for unused counter type) + * @n_functions: number of instrumented functions + * @functions: pointer to pointers to function information + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); + unsigned int n_functions; + struct gcov_fn_info **functions; +}; + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return gcov_info_head; + + return info->next; +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + info->next = gcov_info_head; + gcov_info_head = info; +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/* + * Determine whether a counter is active. Doesn't change at run-time. + */ +static int counter_active(struct gcov_info *info, unsigned int type) +{ + return info->merge[type] ? 1 : 0; +} + +/* Determine number of active counters. Based on gcc magic. */ +static unsigned int num_counter_active(struct gcov_info *info) +{ + unsigned int i; + unsigned int result = 0; + + for (i = 0; i < GCOV_COUNTERS; i++) { + if (counter_active(info, i)) + result++; + } + return result; +} + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_ctr_info *ci_ptr; + unsigned int fi_idx; + unsigned int ct_idx; + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + ci_ptr = info->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { + if (!counter_active(info, ct_idx)) + continue; + + memset(ci_ptr->values, 0, + sizeof(gcov_type) * ci_ptr->num); + ci_ptr++; + } + } +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + return (info1->stamp == info2->stamp); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_ctr_info *dci_ptr; + struct gcov_ctr_info *sci_ptr; + unsigned int fi_idx; + unsigned int ct_idx; + unsigned int val_idx; + + for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) { + dci_ptr = dst->functions[fi_idx]->ctrs; + sci_ptr = src->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { + if (!counter_active(src, ct_idx)) + continue; + + for (val_idx = 0; val_idx < sci_ptr->num; val_idx++) + dci_ptr->values[val_idx] += + sci_ptr->values[val_idx]; + + dci_ptr++; + sci_ptr++; + } + } +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_ctr_info *dci_ptr; /* dst counter info */ + struct gcov_ctr_info *sci_ptr; /* src counter info */ + unsigned int active; + unsigned int fi_idx; /* function info idx */ + unsigned int ct_idx; /* counter type idx */ + size_t fi_size; /* function info size */ + size_t cv_size; /* counter values size */ + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + + dup->next = NULL; + dup->filename = NULL; + dup->functions = NULL; + + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err_free; + + dup->functions = kcalloc(info->n_functions, + sizeof(struct gcov_fn_info *), GFP_KERNEL); + if (!dup->functions) + goto err_free; + + active = num_counter_active(info); + fi_size = sizeof(struct gcov_fn_info); + fi_size += sizeof(struct gcov_ctr_info) * active; + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL); + if (!dup->functions[fi_idx]) + goto err_free; + + *(dup->functions[fi_idx]) = *(info->functions[fi_idx]); + + sci_ptr = info->functions[fi_idx]->ctrs; + dci_ptr = dup->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < active; ct_idx++) { + + cv_size = sizeof(gcov_type) * sci_ptr->num; + + dci_ptr->values = vmalloc(cv_size); + + if (!dci_ptr->values) + goto err_free; + + dci_ptr->num = sci_ptr->num; + memcpy(dci_ptr->values, sci_ptr->values, cv_size); + + sci_ptr++; + dci_ptr++; + } + } + + return dup; +err_free: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + unsigned int active; + unsigned int fi_idx; + unsigned int ct_idx; + struct gcov_ctr_info *ci_ptr; + + if (!info->functions) + goto free_info; + + active = num_counter_active(info); + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + if (!info->functions[fi_idx]) + continue; + + ci_ptr = info->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) + vfree(ci_ptr->values); + + kfree(info->functions[fi_idx]); + } + +free_info: + kfree(info->functions); + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + struct gcov_ctr_info *ci_ptr; + unsigned int fi_idx; + unsigned int ct_idx; + unsigned int cv_idx; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->stamp); + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + fi_ptr = info->functions[fi_idx]; + + /* Function record. */ + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + ci_ptr = fi_ptr->ctrs; + + for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { + if (!counter_active(info, ct_idx)) + continue; + + /* Counter record. */ + pos += store_gcov_u32(buffer, pos, + GCOV_TAG_FOR_COUNTER(ct_idx)); + pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); + + for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { + pos += store_gcov_u64(buffer, pos, + ci_ptr->values[cv_idx]); + } + + ci_ptr++; + } + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 060073ebf7a6..92c8e22a29ed 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -21,7 +21,6 @@ * gcc and need to be kept as close to the original definition as possible to * remain compatible. */ -#define GCOV_COUNTERS 5 #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) @@ -34,60 +33,18 @@ typedef long gcov_type; typedef long long gcov_type; #endif -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { - unsigned int ident; - unsigned int checksum; - unsigned int n_ctrs[0]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. - */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; - void (*merge)(gcov_type *, unsigned int); -}; +/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so + * we cannot use full definition here and they need to be placed in gcc specific + * implementation of gcov. This also means no direct access to the members in + * generic code and usage of the interface below.*/ +struct gcov_info; -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: time stamp - * @filename: name of the associated gcov data file - * @n_functions: number of instrumented functions - * @functions: function data - * @ctr_mask: mask specifying which counter types are active - * @counts: counter data per counter type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { - unsigned int version; - struct gcov_info *next; - unsigned int stamp; - const char *filename; - unsigned int n_functions; - const struct gcov_fn_info *functions; - unsigned int ctr_mask; - struct gcov_ctr_info counts[0]; -}; +/* Interface to access gcov_info data */ +const char *gcov_info_filename(struct gcov_info *info); +unsigned int gcov_info_version(struct gcov_info *info); +struct gcov_info *gcov_info_next(struct gcov_info *info); +void gcov_info_link(struct gcov_info *info); +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); /* Base interface. */ enum gcov_action { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..09094361dce5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -46,6 +46,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/timer.h> #include <linux/freezer.h> @@ -1610,7 +1611,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, unsigned long slack; slack = current->timer_slack_ns; - if (rt_task(current)) + if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3e97fb126e6b..9328b80eaf14 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -16,11 +16,12 @@ #include <linux/export.h> #include <linux/sysctl.h> #include <linux/utsname.h> +#include <trace/events/sched.h> /* * The number of tasks checked: */ -unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; +int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Limit number of tasks checked in a batch. @@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) t->last_switch_count = switch_count; return; } + + trace_sched_process_hang(t); + if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +static atomic_t reset_hung_task = ATOMIC_INIT(0); + +void reset_hung_task_detector(void) +{ + atomic_set(&reset_hung_task, 1); +} +EXPORT_SYMBOL_GPL(reset_hung_task_detector); + /* * kthread which checks for tasks stuck in D state */ @@ -216,6 +228,9 @@ static int watchdog(void *dummy) while (schedule_timeout_interruptible(timeout_jiffies(timeout))) timeout = sysctl_hung_task_timeout_secs; + if (atomic_xchg(&reset_hung_task, 0)) + continue; + check_hung_uninterruptible_tasks(timeout); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3bb14fbe5c6..dc04c166c54d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc) } /** - * irq_disable - Mark interupt disabled + * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled * * If the chip does not implement the irq_disable callback, we diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 706724e9835d..cf68bb36fe58 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) +unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; unsigned int virq; - domain = controller ? irq_find_host(controller) : irq_default_domain; + domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(controller)); + of_node_full_name(irq_data->np)); return 0; } /* If domain has no translation, then we assume interrupt line */ if (domain->ops->xlate == NULL) - hwirq = intspec[0]; + hwirq = irq_data->args[0]; else { - if (domain->ops->xlate(domain, controller, intspec, intsize, - &hwirq, &type)) + if (domain->ops->xlate(domain, irq_data->np, irq_data->args, + irq_data->args_count, &hwirq, &type)) return 0; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 514bcfd855a8..481a13c43b17 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) } /* - * Interrupts explicitely requested as threaded interupts want to be + * Interrupts explicitly requested as threaded interrupts want to be * preemtible - many of them need to sleep and wait for slow busses to * complete. */ @@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_mput; } - sched_setscheduler(t, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); /* * We keep the reference to the task struct even if diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cb228bf21760..abcd6ca86cb7 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; - if (is_early != want_early) + if (!is_early && want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f1030f18..3320b84cc60f 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,6 +14,7 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, + _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -26,6 +27,7 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON +#define IRQ_IS_POLLED GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NESTED_THREAD; } + +static inline bool irq_settings_is_polled(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_IS_POLLED; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012bde9d..a1d8cc63b56e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); - /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + /* + * PER_CPU, nested thread interrupts and interrupts explicitely + * marked polled are excluded from polling. + */ + if (irq_settings_is_per_cpu(desc) || + irq_settings_is_nested_thread(desc) || + irq_settings_is_polled(desc)) goto out; /* @@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->istate & IRQS_POLL_INPROGRESS) + if (desc->istate & IRQS_POLL_INPROGRESS || + irq_settings_is_polled(desc)) return; /* we get here again via the threaded handler */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 297a9247a3b3..9019f15deab2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable); void static_key_slow_inc(struct static_key *key) { + STATIC_KEY_CHECK_USE(); if (atomic_inc_not_zero(&key->enabled)) return; @@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work) void static_key_slow_dec(struct static_key *key) { + STATIC_KEY_CHECK_USE(); __static_key_slow_dec(key, 0, NULL); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_deferred(struct static_key_deferred *key) { + STATIC_KEY_CHECK_USE(); __static_key_slow_dec(&key->key, key->timeout, &key->work); } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); @@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { + STATIC_KEY_CHECK_USE(); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } @@ -212,6 +216,7 @@ void __init jump_label_init(void) key->next = NULL; #endif } + static_key_initialized = true; jump_label_unlock(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a74f307c5ec..9c970167e402 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; size_t vmcoreinfo_size; size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -921,7 +924,7 @@ static int kimage_load_segment(struct kimage *image, * reinitialize them. * * - A machine specific part that includes the syscall number - * and the copies the image to it's final destination. And + * and then copies the image to it's final destination. And * jumps into the image at entry. * * kexec does not sync, or unmount filesystems so if you need @@ -1675,7 +1678,9 @@ int kernel_kexec(void) } else #endif { + kexec_in_progress = true; kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a0d367a49122..ceeadfcabb76 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2066,7 +2066,7 @@ static int __init init_kprobes(void) { int i, err = 0; unsigned long offset = 0, size = 0; - char *modname, namebuf[128]; + char *modname, namebuf[KSYM_NAME_LEN]; const char *symbol_name; void *addr; struct kprobe_blackpoint *kb; @@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) const char *sym = NULL; unsigned int i = *(loff_t *) v; unsigned long offset = 0; - char *modname, namebuf[128]; + char *modname, namebuf[KSYM_NAME_LEN]; head = &kprobe_table[i]; preempt_disable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 760e86df8c20..b5ae3ee860a9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -33,7 +33,7 @@ struct kthread_create_info /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; - struct completion done; + struct completion *done; struct list_head list; }; @@ -178,6 +178,7 @@ static int kthread(void *_create) struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; + struct completion *done; struct kthread self; int ret; @@ -187,10 +188,16 @@ static int kthread(void *_create) init_completion(&self.parked); current->vfork_done = &self.exited; + /* If user was SIGKILLed, I release the structure. */ + done = xchg(&create->done, NULL); + if (!done) { + kfree(create); + do_exit(-EINTR); + } /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; - complete(&create->done); + complete(done); schedule(); ret = -EINTR; @@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { + /* If user was SIGKILLed, I release the structure. */ + struct completion *done = xchg(&create->done, NULL); + + if (!done) { + kfree(create); + return; + } create->result = ERR_PTR(pid); - complete(&create->done); + complete(done); } } @@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), const char namefmt[], ...) { - struct kthread_create_info create; - - create.threadfn = threadfn; - create.data = data; - create.node = node; - init_completion(&create.done); + DECLARE_COMPLETION_ONSTACK(done); + struct task_struct *task; + struct kthread_create_info *create = kmalloc(sizeof(*create), + GFP_KERNEL); + + if (!create) + return ERR_PTR(-ENOMEM); + create->threadfn = threadfn; + create->data = data; + create->node = node; + create->done = &done; spin_lock(&kthread_create_lock); - list_add_tail(&create.list, &kthread_create_list); + list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); - wait_for_completion(&create.done); - - if (!IS_ERR(create.result)) { + /* + * Wait for completion in killable state, for I might be chosen by + * the OOM killer while kthreadd is trying to allocate memory for + * new kernel thread. + */ + if (unlikely(wait_for_completion_killable(&done))) { + /* + * If I was SIGKILLed before kthreadd (or new kernel thread) + * calls complete(), leave the cleanup of this structure to + * that thread. + */ + if (xchg(&create->done, NULL)) + return ERR_PTR(-ENOMEM); + /* + * kthreadd (or new kernel thread) will call complete() + * shortly. + */ + wait_for_completion(&done); + } + task = create->result; + if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); - vsnprintf(create.result->comm, sizeof(create.result->comm), - namefmt, args); + vsnprintf(task->comm, sizeof(task->comm), namefmt, args); va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. */ - sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(create.result, cpu_all_mask); + sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(task, cpu_all_mask); } - return create.result; + kfree(create); + return task; } EXPORT_SYMBOL(kthread_create_on_node); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile new file mode 100644 index 000000000000..baab8e5e7f66 --- /dev/null +++ b/kernel/locking/Makefile @@ -0,0 +1,25 @@ + +obj-y += mutex.o semaphore.o rwsem.o lglock.o + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +endif + +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o +obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/kernel/lglock.c b/kernel/locking/lglock.c index 86ae2aebf004..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/locking/lglock.c diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c index e16c45b9ee77..eb8a54783fa0 100644 --- a/kernel/lockdep.c +++ b/kernel/locking/lockdep.c @@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class) /* * Is this the address of a static object: */ +#ifdef __KERNEL__ static int static_obj(void *obj) { unsigned long start = (unsigned long) &_stext, @@ -616,6 +617,7 @@ static int static_obj(void *obj) */ return is_module_address(addr) || is_module_percpu_address(addr); } +#endif /* * To make lock name printouts unique, we calculate a unique @@ -1232,7 +1234,7 @@ static int noop_count(struct lock_list *entry, void *data) return 0; } -unsigned long __lockdep_count_forward_deps(struct lock_list *this) +static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; struct lock_list *uninitialized_var(target_entry); @@ -1258,7 +1260,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) return ret; } -unsigned long __lockdep_count_backward_deps(struct lock_list *this) +static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; struct lock_list *uninitialized_var(target_entry); @@ -4115,6 +4117,7 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); +#ifdef __KERNEL__ void debug_show_all_locks(void) { struct task_struct *g, *p; @@ -4172,6 +4175,7 @@ retry: read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); +#endif /* * Careful: only use this function if you are sure that @@ -4224,7 +4228,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : rcu_is_cpu_idle() + : !rcu_is_watching() ? "RCU used illegally from idle CPU!\n" : "", rcu_scheduler_active, debug_locks); @@ -4247,7 +4251,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * So complain bitterly if someone does call rcu_read_lock(), * rcu_read_lock_bh() and so on from extended quiescent states. */ - if (rcu_is_cpu_idle()) + if (!rcu_is_watching()) printk("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..4f560cfedc8f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b2c71c5873e4..ef43ac4bafb5 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); + seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) @@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) } if (i) { seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); + seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); seq_puts(m, "\n"); } } static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.3\n"); + seq_puts(m, "lock_stat version 0.4\n"); if (unlikely(!debug_locks)) seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", "class name", "con-bounces", @@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m) "waittime-min", "waittime-max", "waittime-total", + "waittime-avg", "acq-bounces", "acquisitions", "holdtime-min", "holdtime-max", - "holdtime-total"); - seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + "holdtime-total", + "holdtime-avg"); + seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); seq_printf(m, "\n"); } diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..995b0cc2b84c 100644 --- a/kernel/lockdep_states.h +++ b/kernel/locking/lockdep_states.h diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..faf6f5b53e77 100644 --- a/kernel/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -75,7 +75,12 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current); + + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/locking/mutex-debug.h diff --git a/kernel/mutex.c b/kernel/locking/mutex.c index d24105b1b794..4dd6e4c219de 100644 --- a/kernel/mutex.c +++ b/kernel/locking/mutex.c @@ -1,5 +1,5 @@ /* - * kernel/mutex.c + * kernel/locking/mutex.c * * Mutexes: blocking mutual exclusion locks * diff --git a/kernel/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/locking/mutex.h diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/kernel/locking/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/percpu.h> +#include <linux/wait.h> +#include <linux/lockdep.h> +#include <linux/percpu-rwsem.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/errno.h> + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, + const char *name, struct lock_class_key *rwsem_key) +{ + brw->fast_read_ctr = alloc_percpu(int); + if (unlikely(!brw->fast_read_ctr)) + return -ENOMEM; + + /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + __init_rwsem(&brw->rw_sem, name, rwsem_key); + atomic_set(&brw->write_ctr, 0); + atomic_set(&brw->slow_read_ctr, 0); + init_waitqueue_head(&brw->write_waitq); + return 0; +} + +void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ + free_percpu(brw->fast_read_ctr); + brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + * R_W: down_write() comes after up_read(), the writer should see all + * changes done by the reader + * or + * W_R: down_read() comes after up_write(), the reader should see all + * changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ + bool success = false; + + preempt_disable(); + if (likely(!atomic_read(&brw->write_ctr))) { + __this_cpu_add(*brw->fast_read_ctr, val); + success = true; + } + preempt_enable(); + + return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). + */ +void percpu_down_read(struct percpu_rw_semaphore *brw) +{ + might_sleep(); + if (likely(update_fast_ctr(brw, +1))) { + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + return; + } + + down_read(&brw->rw_sem); + atomic_inc(&brw->slow_read_ctr); + /* avoid up_read()->rwsem_release() */ + __up_read(&brw->rw_sem); +} + +void percpu_up_read(struct percpu_rw_semaphore *brw) +{ + rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); + + if (likely(update_fast_ctr(brw, -1))) + return; + + /* false-positive is possible but harmless */ + if (atomic_dec_and_test(&brw->slow_read_ctr)) + wake_up_all(&brw->write_waitq); +} + +static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +{ + unsigned int sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + sum += per_cpu(*brw->fast_read_ctr, cpu); + per_cpu(*brw->fast_read_ctr, cpu) = 0; + } + + return sum; +} + +/* + * A writer increments ->write_ctr to force the readers to switch to the + * slow mode, note the atomic_read() check in update_fast_ctr(). + * + * After that the readers can only inc/dec the slow ->slow_read_ctr counter, + * ->fast_read_ctr is stable. Once the writer moves its sum into the slow + * counter it represents the number of active readers. + * + * Finally the writer takes ->rw_sem for writing and blocks the new readers, + * then waits until the slow counter becomes zero. + */ +void percpu_down_write(struct percpu_rw_semaphore *brw) +{ + /* tell update_fast_ctr() there is a pending writer */ + atomic_inc(&brw->write_ctr); + /* + * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read + * so that update_fast_ctr() can't succeed. + * + * 2. Ensures we see the result of every previous this_cpu_add() in + * update_fast_ctr(). + * + * 3. Ensures that if any reader has exited its critical section via + * fast-path, it executes a full memory barrier before we return. + * See R_W case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + + /* exclude other writers, and block the new readers completely */ + down_write(&brw->rw_sem); + + /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ + atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + + /* wait for all readers to complete their percpu_up_read() */ + wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); +} + +void percpu_up_write(struct percpu_rw_semaphore *brw) +{ + /* release the lock, but the readers can't use the fast-path */ + up_write(&brw->rw_sem); + /* + * Insert the barrier before the next fast-path in down_read, + * see W_R case in the comment above update_fast_ctr(). + */ + synchronize_sched_expedited(); + /* the last writer unblocks update_fast_ctr() */ + atomic_dec(&brw->write_ctr); +} diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -24,7 +24,7 @@ #include <linux/kallsyms.h> #include <linux/syscalls.h> #include <linux/interrupt.h> -#include <linux/plist.h> +#include <linux/rbtree.h> #include <linux/fs.h> #include <linux/debug_locks.h> @@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } @@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..14193d596d78 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c index 1d96dd0d93c1..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/locking/rtmutex-tester.c diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..2e960a2bab81 100644 --- a/kernel/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -14,6 +14,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/timer.h> #include "rtmutex_common.h" @@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +static inline int +rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio < right->prio) + return 1; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 1 above, + * then right waiter has a dl_prio() too. + */ + if (dl_prio(left->prio)) + return (left->task->dl.deadline < right->task->dl.deadline); + + return 0; +} + +static void +rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &lock->waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + lock->waiters_leftmost = &waiter->tree_entry; + + rb_link_node(&waiter->tree_entry, parent, link); + rb_insert_color(&waiter->tree_entry, &lock->waiters); +} + +static void +rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->tree_entry)) + return; + + if (lock->waiters_leftmost == &waiter->tree_entry) + lock->waiters_leftmost = rb_next(&waiter->tree_entry); + + rb_erase(&waiter->tree_entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree_entry); +} + +static void +rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + task->pi_waiters_leftmost = &waiter->pi_tree_entry; + + rb_link_node(&waiter->pi_tree_entry, parent, link); + rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); +} + +static void +rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + return; + + if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) + task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); + + rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree_entry); +} + /* - * Calculate task priority from the waiter list priority + * Calculate task priority from the waiter tree priority * - * Return task->normal_prio when the waiter list is empty or when + * Return task->normal_prio when the waiter tree is empty or when * the waiter is not allowed to do priority boosting */ int rt_mutex_getprio(struct task_struct *task) @@ -102,10 +200,18 @@ int rt_mutex_getprio(struct task_struct *task) if (likely(!task_has_pi_waiters(task))) return task->normal_prio; - return min(task_top_pi_waiter(task)->pi_list_entry.prio, + return min(task_top_pi_waiter(task)->prio, task->normal_prio); } +struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return NULL; + + return task_top_pi_waiter(task)->task; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * @@ -115,7 +221,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); - if (task->prio != prio) + if (task->prio != prio || dl_prio(prio)) rt_mutex_setprio(task, prio); } @@ -233,7 +339,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * When deadlock detection is off then we check, if further * priority adjustment is necessary. */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) + if (!detect_deadlock && waiter->prio == task->prio) goto out_unlock_pi; lock = waiter->lock; @@ -254,9 +360,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); + waiter->prio = task->prio; + rt_mutex_enqueue(lock, waiter); /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -280,17 +386,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, top_waiter); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } else if (top_waiter == waiter) { /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } @@ -355,7 +459,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (task->prio >= rt_mutex_top_waiter(lock)->prio) { if (!waiter || waiter != rt_mutex_top_waiter(lock)) return 0; } @@ -369,7 +473,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* remove the queued waiter. */ if (waiter) { - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); task->pi_blocked_on = NULL; } @@ -379,8 +483,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) { top = rt_mutex_top_waiter(lock); - top->pi_list_entry.prio = top->list_entry.prio; - plist_add(&top->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, top); } raw_spin_unlock_irqrestore(&task->pi_lock, flags); } @@ -416,13 +519,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, task->prio); - plist_node_init(&waiter->pi_list_entry, task->prio); + waiter->prio = task->prio; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_enqueue(lock, waiter); task->pi_blocked_on = waiter; @@ -433,8 +535,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, top_waiter); + rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) @@ -486,7 +588,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * boosted mode and go back to normal after releasing * lock->wait_lock. */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + rt_mutex_dequeue_pi(current, waiter); rt_mutex_set_owner(lock, NULL); @@ -510,7 +612,7 @@ static void remove_waiter(struct rt_mutex *lock, int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); @@ -521,13 +623,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, waiter); if (rt_mutex_has_waiters(lock)) { struct rt_mutex_waiter *next; next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); + rt_mutex_enqueue_pi(owner, next); } __rt_mutex_adjust_prio(owner); @@ -537,8 +639,6 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!chain_walk) return; @@ -565,7 +665,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { + if (!waiter || (waiter->prio == task->prio && + !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -638,6 +739,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); + RB_CLEAR_NODE(&waiter.pi_tree_entry); + RB_CLEAR_NODE(&waiter.tree_entry); raw_spin_lock(&lock->wait_lock); @@ -904,7 +1007,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); + lock->waiters = RB_ROOT; + lock->waiters_leftmost = NULL; debug_rt_mutex_init(lock, name); } diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..a1a1dd06421d 100644 --- a/kernel/rtmutex.h +++ b/kernel/locking/rtmutex.h diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..7431a9c86f35 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @list_entry: pi node to enqueue into the mutex waiters list - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @tree_entry: pi node to enqueue into the mutex waiters tree + * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { - struct plist_node list_entry; - struct plist_node pi_list_entry; + struct rb_node tree_entry; + struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -54,14 +54,15 @@ struct rt_mutex_waiter { struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif + int prio; }; /* - * Various helpers to access the waiters-plist: + * Various helpers to access the waiters-tree: */ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !plist_head_empty(&lock->wait_list); + return !RB_EMPTY_ROOT(&lock->waiters); } static inline struct rt_mutex_waiter * @@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, - list_entry); + w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, + tree_entry); BUG_ON(w->lock != lock); return w; @@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !plist_head_empty(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, - pi_list_entry); + return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, + pi_tree_entry); } /* diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 000000000000..9be8a9144978 --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c @@ -0,0 +1,296 @@ +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for + * generic spinlock implementation + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/export.h> + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +int rwsem_is_locked(struct rw_semaphore *sem) +{ + int ret = 1; + unsigned long flags; + + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { + ret = (sem->activity != 0); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + } + return ret; +} +EXPORT_SYMBOL(rwsem_is_locked); + +/* + * initialise the semaphore + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->activity = 0; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} +EXPORT_SYMBOL(__init_rwsem); + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here, then: + * - the 'active count' _reached_ zero + * - the 'waiting count' is non-zero + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + int woken; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. */ + wake_up_process(waiter->task); + goto out; + } + + /* grant an infinite number of read locks to the front of the queue */ + woken = 0; + do { + struct list_head *next = waiter->list.next; + + list_del(&waiter->list); + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + woken++; + if (next == &sem->wait_list) + break; + waiter = list_entry(next, struct rwsem_waiter, list); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + sem->activity += woken; + + out: + return sem; +} + +/* + * wake a single writer + */ +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) +{ + struct rwsem_waiter *waiter; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + wake_up_process(waiter->task); + + return sem; +} + +/* + * get a read lock on the semaphore + */ +void __sched __down_read(struct rw_semaphore *sem) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + goto out; + } + + tsk = current; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we don't need to touch the semaphore struct anymore */ + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + + tsk->state = TASK_RUNNING; + out: + ; +} + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int __down_read_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * get a write lock on the semaphore + */ +void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* set up my own style of waitqueue */ + tsk = current; + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + list_add_tail(&waiter.list, &sem->wait_list); + + /* wait for someone to release the lock */ + for (;;) { + /* + * That is the key to support write lock stealing: allows the + * task already on CPU to get the lock soon rather than put + * itself into sleep and waiting for system woke it or someone + * else in the head of the wait list up. + */ + if (sem->activity == 0) + break; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + schedule(); + raw_spin_lock_irqsave(&sem->wait_lock, flags); + } + /* got the lock */ + sem->activity = -1; + list_del(&waiter.list); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +void __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int __down_write_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity == 0) { + /* got the lock */ + sem->activity = -1; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * release a read lock on the semaphore + */ +void __up_read(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + sem = __rwsem_wake_one_writer(sem); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * release a write lock on the semaphore + */ +void __up_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 0; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 1); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void __downgrade_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 1; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 0); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 000000000000..19c5fa95e0b4 --- /dev/null +++ b/kernel/locking/rwsem-xadd.c @@ -0,0 +1,293 @@ +/* rwsem.c: R/W semaphores: contention handling functions + * + * Written by David Howells (dhowells@redhat.com). + * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/export.h> + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->count = RWSEM_UNLOCKED_VALUE; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then: + * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) + * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - there must be someone on the queue + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if downgrading is false + */ +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + struct list_head *next; + long oldcount, woken, loop, adjustment; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); + goto out; + } + + /* Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } + + /* Grant an infinite number of read locks to the readers at the front + * of the queue. Note we increment the 'active part' of the count by + * the number of readers before waking any processes up. + */ + woken = 0; + do { + woken++; + + if (waiter->list.next == &sem->wait_list) + break; + + waiter = list_entry(waiter->list.next, + struct rwsem_waiter, list); + + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (waiter->type != RWSEM_WAITING_FOR_WRITE) + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + + if (adjustment) + rwsem_atomic_add(adjustment, sem); + + next = sem->wait_list.next; + loop = woken; + do { + waiter = list_entry(next, struct rwsem_waiter, list); + next = waiter->list.next; + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + } while (--loop); + + sem->wait_list.next = next; + next->prev = &sem->wait_list; + + out: + return sem; +} + +/* + * wait for the read lock to be granted + */ +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * wait until we successfully acquire the write lock + */ +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (true) { + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) + break; + } + + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + + raw_spin_lock_irq(&sem->wait_lock); + } + + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +EXPORT_SYMBOL(rwsem_down_read_failed); +EXPORT_SYMBOL(rwsem_down_write_failed); +EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/locking/rwsem.c diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..6815171a4fff 100644 --- a/kernel/semaphore.c +++ b/kernel/locking/semaphore.c diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/locking/spinlock.c diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 000000000000..0374a596cffa --- /dev/null +++ b/kernel/locking/spinlock_debug.c @@ -0,0 +1,302 @@ +/* + * Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + * + * This file contains the spinlock/rwlock implementations for + * DEBUG_SPINLOCK. + */ + +#include <linux/spinlock.h> +#include <linux/nmi.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include <linux/delay.h> +#include <linux/export.h> + +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + lock->magic = SPINLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__raw_spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; + lock->magic = RWLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + +static void spin_dump(raw_spinlock_t *lock, const char *msg) +{ + struct task_struct *owner = NULL; + + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) + owner = lock->owner; + printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", + msg, raw_smp_processor_id(), + current->comm, task_pid_nr(current)); + printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d\n", + lock, lock->magic, + owner ? owner->comm : "<none>", + owner ? task_pid_nr(owner) : -1, + lock->owner_cpu); + dump_stack(); +} + +static void spin_bug(raw_spinlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + spin_dump(lock, msg); +} + +#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) + +static inline void +debug_spin_lock_before(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(lock->owner == current, lock, "recursion"); + SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_spin_lock_after(raw_spinlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_spin_unlock(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); + SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); + SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +static void __spin_lock_debug(raw_spinlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + + for (i = 0; i < loops; i++) { + if (arch_spin_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + spin_dump(lock, "lockup suspected"); +#ifdef CONFIG_SMP + trigger_all_cpu_backtrace(); +#endif + + /* + * The trylock above was causing a livelock. Give the lower level arch + * specific lock code a chance to acquire the lock. We have already + * printed a warning/backtrace at this point. The non-debug arch + * specific code might actually succeed in acquiring the lock. If it is + * not successful, the end-result is the same - there is no forward + * progress. + */ + arch_spin_lock(&lock->raw_lock); +} + +void do_raw_spin_lock(raw_spinlock_t *lock) +{ + debug_spin_lock_before(lock); + if (unlikely(!arch_spin_trylock(&lock->raw_lock))) + __spin_lock_debug(lock); + debug_spin_lock_after(lock); +} + +int do_raw_spin_trylock(raw_spinlock_t *lock) +{ + int ret = arch_spin_trylock(&lock->raw_lock); + + if (ret) + debug_spin_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_spin_unlock(raw_spinlock_t *lock) +{ + debug_spin_unlock(lock); + arch_spin_unlock(&lock->raw_lock); +} + +static void rwlock_bug(rwlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", + msg, raw_smp_processor_id(), current->comm, + task_pid_nr(current), lock); + dump_stack(); +} + +#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __read_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_read_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_read_lock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_lock(&lock->raw_lock); +} + +int do_raw_read_trylock(rwlock_t *lock) +{ + int ret = arch_read_trylock(&lock->raw_lock); + +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_read_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_unlock(&lock->raw_lock); +} + +static inline void debug_write_lock_before(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); + RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_write_lock_after(rwlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_write_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); + RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +#if 0 /* This can cause lockups */ +static void __write_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_write_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_write_lock(rwlock_t *lock) +{ + debug_write_lock_before(lock); + arch_write_lock(&lock->raw_lock); + debug_write_lock_after(lock); +} + +int do_raw_write_trylock(rwlock_t *lock) +{ + int ret = arch_write_trylock(&lock->raw_lock); + + if (ret) + debug_write_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_write_unlock(rwlock_t *lock) +{ + debug_write_unlock(lock); + arch_write_unlock(&lock->raw_lock); +} diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 4a9a86d12c8b..000000000000 --- a/kernel/modsign_certificate.S +++ /dev/null @@ -1,12 +0,0 @@ -#include <linux/export.h> - -#define GLOBAL(name) \ - .globl VMLINUX_SYMBOL(name); \ - VMLINUX_SYMBOL(name): - - .section ".init.data","aw" - -GLOBAL(modsign_certificate_list) - .incbin "signing_key.x509" - .incbin "extra_certificates" -GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e6..000000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Public keys for module signature verification - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include "module-internal.h" - -struct key *modsign_keyring; - -extern __initconst const u8 modsign_certificate_list[]; -extern __initconst const u8 modsign_certificate_list_end[]; - -/* - * We need to make sure ccache doesn't cache the .o file as it doesn't notice - * if modsign.pub changes. - */ -static __initconst const char annoy_ccache[] = __TIME__ "foo"; - -/* - * Load the compiled-in keys - */ -static __init int module_verify_init(void) -{ - pr_notice("Initialise module verification\n"); - - modsign_keyring = keyring_alloc(".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(modsign_keyring)) - panic("Can't allocate module signing keyring\n"); - - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(module_verify_init); - -/* - * Load the compiled-in keys - */ -static __init int load_module_signing_keys(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading module verification certificates\n"); - - end = modsign_certificate_list_end; - p = modsign_certificate_list; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(modsign_keyring, 1), - "asymmetric", - NULL, - p, - plen, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(key)) - pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - else - pr_notice("MODSIGN: Loaded cert '%s'\n", - key_ref_to_ptr(key)->description); - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d02..915e123a430f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,6 +9,4 @@ * 2 of the Licence, or (at your option) any later version. */ -extern struct key *modsign_keyring; - extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index dc582749fa13..d24fcf29cb64 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms, if (syms->licence == GPL_ONLY) return false; if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", fsa->name); + pr_warn("Symbol %s is being used by a non-GPL module, " + "which will not be allowed in the future\n", + fsa->name); } } #ifdef CONFIG_UNUSED_SYMBOLS if (syms->unused && fsa->warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", fsa->name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); + pr_warn("Symbol %s is marked as UNUSED, however this module is " + "using it.\n", fsa->name); + pr_warn("This symbol will go away in the future.\n"); + pr_warn("Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); } #endif @@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info) return 0; if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - mod->name, align, PAGE_SIZE); + pr_warn("%s: per-cpu alignment %li > %li\n", + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); if (!mod->percpu) { - printk(KERN_WARNING - "%s: Could not allocate %lu bytes percpu data\n", - mod->name, (unsigned long)pcpusec->sh_size); + pr_warn("%s: Could not allocate %lu bytes percpu data\n", + mod->name, (unsigned long)pcpusec->sh_size); return -ENOMEM; } mod->percpu_size = pcpusec->sh_size; @@ -644,8 +641,6 @@ static int module_unload_init(struct module *mod) /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); - /* Backwards compatibility macros put refcount during init. */ - mod->waiter = current; return 0; } @@ -679,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b) pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { - printk(KERN_WARNING "%s: out of memory loading\n", a->name); + pr_warn("%s: out of memory loading\n", a->name); return -ENOMEM; } @@ -771,16 +766,9 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - if (flags & O_NONBLOCK) { - struct stopref sref = { mod, flags, forced }; + struct stopref sref = { mod, flags, forced }; - return stop_machine(__try_stop_module, &sref, NULL); - } else { - /* We don't need to stop the machine for this. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - return 0; - } + return stop_machine(__try_stop_module, &sref, NULL); } unsigned long module_refcount(struct module *mod) @@ -813,21 +801,6 @@ EXPORT_SYMBOL(module_refcount); /* This exists whether we can unload or not */ static void free_module(struct module *mod); -static void wait_for_zero_refcount(struct module *mod) -{ - /* Since we might sleep for some time, release the mutex first */ - mutex_unlock(&module_mutex); - for (;;) { - pr_debug("Looking at refcount...\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - if (module_refcount(mod) == 0) - break; - schedule(); - } - current->state = TASK_RUNNING; - mutex_lock(&module_mutex); -} - SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -842,6 +815,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; + if (!(flags & O_NONBLOCK)) + pr_warn("waiting module removal not supported: please upgrade\n"); + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -859,8 +835,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Doing init or already dying? */ if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count and wake up - waiter --RR */ + /* FIXME: if (force), slam module count damn the torpedoes */ pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; @@ -876,18 +851,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Set this up before setting mod->state */ - mod->waiter = current; - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; - /* Never wait if forced. */ - if (!forced && module_refcount(mod) != 0) - wait_for_zero_refcount(mod); - mutex_unlock(&module_mutex); /* Final destruction now no one is using it. */ if (mod->exit != NULL) @@ -1005,9 +973,6 @@ void module_put(struct module *module) __this_cpu_inc(module->refptr->decs); trace_module_put(module, _RET_IP_); - /* Maybe they're waiting for us to drop reference? */ - if (unlikely(!module_is_live(module))) - wake_up_process(module->waiter); preempt_enable(); } } @@ -1145,8 +1110,7 @@ static int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) - printk(KERN_WARNING "%s: %s: kernel tainted.\n", - mod->name, reason); + pr_warn("%s: %s: kernel tainted.\n", mod->name, reason); add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else @@ -1199,8 +1163,7 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - printk(KERN_WARNING "%s: no symbol version for %s\n", - mod->name, symname); + pr_warn("%s: no symbol version for %s\n", mod->name, symname); return 0; bad_version: @@ -1309,8 +1272,8 @@ resolve_symbol_wait(struct module *mod, !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { - printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", - mod->name, owner); + pr_warn("%s: gave up waiting for init of module %s.\n", + mod->name, owner); } return ksym; } @@ -1626,15 +1589,14 @@ static int mod_sysfs_init(struct module *mod) struct kobject *kobj; if (!module_sysfs_initialized) { - printk(KERN_ERR "%s: module sysfs not initialized\n", - mod->name); + pr_err("%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; } kobj = kset_find_obj(module_kset, mod->name); if (kobj) { - printk(KERN_ERR "%s: module is already loaded\n", mod->name); + pr_err("%s: module is already loaded\n", mod->name); kobject_put(kobj); err = -EINVAL; goto out; @@ -1961,8 +1923,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(s->name, &owner, NULL, true, false)) { - printk(KERN_ERR - "%s: exports duplicate symbol %s" + pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, s->name, module_name(owner)); return -ENOEXEC; @@ -2013,8 +1974,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); + pr_warn("%s: Unknown symbol %s (err %li)\n", + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; @@ -2168,8 +2129,8 @@ static void set_license(struct module *mod, const char *license) if (!license_is_gpl_compatible(license)) { if (!test_taint(TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints " - "kernel.\n", mod->name, license); + pr_warn("%s: module license '%s' taints kernel.\n", + mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); } @@ -2405,8 +2366,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) - printk(KERN_ERR "dynamic debug error adding module: %s\n", - debug->modname); + pr_err("dynamic debug error adding module: %s\n", + debug->modname); #endif } @@ -2619,8 +2580,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) Elf_Shdr *shdr = &info->sechdrs[i]; if (shdr->sh_type != SHT_NOBITS && info->len < shdr->sh_offset + shdr->sh_size) { - printk(KERN_ERR "Module len %lu truncated\n", - info->len); + pr_err("Module len %lu truncated\n", info->len); return -ENOEXEC; } @@ -2682,15 +2642,14 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - printk(KERN_WARNING "No module found in object\n"); + pr_warn("No module found in object\n"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; if (info->index.sym == 0) { - printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", - mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", mod->name); return ERR_PTR(-ENOEXEC); } @@ -2717,7 +2676,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) if (err) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { - printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", + pr_err("%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); return -ENOEXEC; } @@ -2727,9 +2686,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); - printk(KERN_WARNING "%s: module is from the staging directory," - " the quality is unknown, you have been warned.\n", - mod->name); + pr_warn("%s: module is from the staging directory, the quality " + "is unknown, you have been warned.\n", mod->name); } /* Set up license info based on the info section */ @@ -2738,7 +2696,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return 0; } -static void find_module_sections(struct module *mod, struct load_info *info) +static int find_module_sections(struct module *mod, struct load_info *info) { mod->kp = section_objs(info, "__param", sizeof(*mod->kp), &mod->num_kp); @@ -2768,6 +2726,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors), &mod->num_ctors); + if (!mod->ctors) + mod->ctors = section_objs(info, ".init_array", + sizeof(*mod->ctors), &mod->num_ctors); + else if (find_sec(info, ".init_array")) { + /* + * This shouldn't happen with same compiler and binutils + * building all parts of the module. + */ + printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + mod->name); + return -EINVAL; + } #endif #ifdef CONFIG_TRACEPOINTS @@ -2801,11 +2771,12 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->extable), &mod->num_exentries); if (section_addr(info, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); + pr_warn("%s: Ignoring obsolete parameters\n", mod->name); info->debug = section_objs(info, "__verbose", sizeof(*info->debug), &info->num_debug); + + return 0; } static int move_module(struct module *mod, struct load_info *info) @@ -3078,11 +3049,10 @@ static int do_init_module(struct module *mod) return ret; } if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); + pr_warn("%s: '%s'->init suspiciously returned %d, it should " + "follow 0/-E convention\n" + "%s: loading module anyway...\n", + __func__, mod->name, ret, __func__); dump_stack(); } @@ -3205,10 +3175,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname) { /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); - if (ret != 0) { - printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", - modname, param); - } + if (ret != 0) + pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); return 0; } @@ -3243,10 +3211,9 @@ static int load_module(struct load_info *info, const char __user *uargs, #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; if (!mod->sig_ok) { - printk_once(KERN_NOTICE - "%s: module verification failed: signature and/or" - " required key missing - tainting kernel\n", - mod->name); + pr_notice_once("%s: module verification failed: signature " + "and/or required key missing - tainting " + "kernel\n", mod->name); add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); } #endif @@ -3263,7 +3230,9 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, info); + err = find_module_sections(mod, info); + if (err) + goto free_unload; err = check_module_license_and_versions(mod); if (err) diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2970bddc5ea..be5b8fac4bd0 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,6 +14,7 @@ #include <crypto/public_key.h> #include <crypto/hash.h> #include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> #include "module-internal.h" /* @@ -28,7 +29,7 @@ */ struct module_signature { u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 hash; /* Digest algorithm [enum hash_algo] */ u8 id_type; /* Key identifier type [enum pkey_id_type] */ u8 signer_len; /* Length of signer's name */ u8 key_id_len; /* Length of key identifier */ @@ -39,7 +40,7 @@ struct module_signature { /* * Digest the module contents. */ -static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +static struct public_key_signature *mod_make_digest(enum hash_algo hash, const void *mod, unsigned long modlen) { @@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, pr_debug("Look up: \"%s\"\n", id); - key = keyring_search(make_key_ref(modsign_keyring, 1), + key = keyring_search(make_key_ref(system_trusted_keyring, 1), &key_type_asymmetric, id); if (IS_ERR(key)) pr_warn("Request for unknown module key '%s' err %ld\n", @@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -ENOPKG; if (ms.hash >= PKEY_HASH__LAST || - !pkey_hash_algo[ms.hash]) + !hash_algo_name[ms.hash]) return -ENOPKG; key = request_asymmetric_key(sig, ms.signer_len, diff --git a/kernel/padata.c b/kernel/padata.c index 07af2c95dcfe..2abd25d79cc8 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) static int padata_cpu_hash(struct parallel_data *pd) { + unsigned int seq_nr; int cpu_index; /* @@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd) * seq_nr mod. number of cpus in use. */ - spin_lock(&pd->seq_lock); - cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); - pd->seq_nr++; - spin_unlock(&pd->seq_lock); + seq_nr = atomic_inc_return(&pd->seq_nr); + cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } @@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - pd->seq_nr = 0; + atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; diff --git a/kernel/panic.c b/kernel/panic.c index b6c482ccc5db..6d6300375090 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,7 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -int panic_timeout; +int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -233,7 +233,7 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; if (tainted_mask) { char *s; diff --git a/kernel/params.c b/kernel/params.c index c00d5b502aa4..b00142e7f3ba 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -227,17 +227,10 @@ int parse_args(const char *doing, } /* Lazy bastard, eh? */ -#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ +#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ - tmptype l; \ - int ret; \ - \ - ret = strtolfn(val, 0, &l); \ - if (ret < 0 || ((type)l != l)) \ - return ret < 0 ? ret : -EINVAL; \ - *((type *)kp->arg) = l; \ - return 0; \ + return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ @@ -253,13 +246,13 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 42086551a24a..06c62de9c711 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -132,6 +132,12 @@ out: return ERR_PTR(err); } +static void delayed_free_pidns(struct rcu_head *p) +{ + kmem_cache_free(pid_ns_cachep, + container_of(p, struct pid_namespace, rcu)); +} + static void destroy_pid_namespace(struct pid_namespace *ns) { int i; @@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); - kmem_cache_free(pid_ns_cachep, ns); + call_rcu(&ns->rcu, delayed_free_pidns); } struct pid_namespace *copy_pid_ns(unsigned long flags, diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c7f31aa272f7..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) /* * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, @@ -260,30 +261,53 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return 0; } +static int posix_cpu_clock_get_task(struct task_struct *tsk, + const clockid_t which_clock, + struct timespec *tp) +{ + int err = -EINVAL; + unsigned long long rtn; + + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(tsk, current)) + err = cpu_clock_sample(which_clock, tsk, &rtn); + } else { + unsigned long flags; + struct sighand_struct *sighand; + + /* + * while_each_thread() is not yet entirely RCU safe, + * keep locking the group while sampling process + * clock for now. + */ + sighand = lock_task_sighand(tsk, &flags); + if (!sighand) + return err; + + if (tsk == current || thread_group_leader(tsk)) + err = cpu_clock_sample_group(which_clock, tsk, &rtn); + + unlock_task_sighand(tsk, &flags); + } + + if (!err) + sample_to_timespec(which_clock, rtn, tp); + + return err; +} + static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); - int error = -EINVAL; - unsigned long long rtn; + int err = -EINVAL; if (pid == 0) { /* * Special case constant value for our own clocks. * We don't have to do any lookup to find ourselves. */ - if (CPUCLOCK_PERTHREAD(which_clock)) { - /* - * Sampling just ourselves we can do with no locking. - */ - error = cpu_clock_sample(which_clock, - current, &rtn); - } else { - read_lock(&tasklist_lock); - error = cpu_clock_sample_group(which_clock, - current, &rtn); - read_unlock(&tasklist_lock); - } + err = posix_cpu_clock_get_task(current, which_clock, tp); } else { /* * Find the given PID, and validate that the caller @@ -292,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(pid); - if (p) { - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(p, current)) { - error = cpu_clock_sample(which_clock, - p, &rtn); - } - } else { - read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->sighand) { - error = - cpu_clock_sample_group(which_clock, - p, &rtn); - } - read_unlock(&tasklist_lock); - } - } + if (p) + err = posix_cpu_clock_get_task(p, which_clock, tp); rcu_read_unlock(); } - if (error) - return error; - sample_to_timespec(which_clock, rtn, tp); - return 0; + return err; } @@ -371,36 +378,40 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) */ static int posix_cpu_timer_del(struct k_itimer *timer) { - struct task_struct *p = timer->it.cpu.task; int ret = 0; + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; - if (likely(p != NULL)) { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { - /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. - */ - BUG_ON(!list_empty(&timer->it.cpu.entry)); - } else { - spin_lock(&p->sighand->siglock); - if (timer->it.cpu.firing) - ret = TIMER_RETRY; - else - list_del(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); - } - read_unlock(&tasklist_lock); + WARN_ON_ONCE(p == NULL); - if (!ret) - put_task_struct(p); + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + } else { + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + + unlock_task_sighand(p, &flags); } + if (!ret) + put_task_struct(p); + return ret; } -static void cleanup_timers_list(struct list_head *head, - unsigned long long curr) +static void cleanup_timers_list(struct list_head *head) { struct cpu_timer_list *timer, *next; @@ -414,16 +425,11 @@ static void cleanup_timers_list(struct list_head *head, * time for later timer_gettime calls to return. * This must be called with the siglock held. */ -static void cleanup_timers(struct list_head *head, - cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) +static void cleanup_timers(struct list_head *head) { - - cputime_t ptime = utime + stime; - - cleanup_timers_list(head, cputime_to_expires(ptime)); - cleanup_timers_list(++head, cputime_to_expires(utime)); - cleanup_timers_list(++head, sum_exec_runtime); + cleanup_timers_list(head); + cleanup_timers_list(++head); + cleanup_timers_list(++head); } /* @@ -433,41 +439,14 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { - cputime_t utime, stime; - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); - task_cputime(tsk, &utime, &stime); - cleanup_timers(tsk->cpu_timers, - utime, stime, tsk->se.sum_exec_runtime); + cleanup_timers(tsk->cpu_timers); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); - cleanup_timers(tsk->signal->cpu_timers, - utime + sig->utime, stime + sig->stime, - tsk->se.sum_exec_runtime + sig->sum_sched_runtime); -} - -static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) -{ - struct cpu_timer_list *timer = &itimer->it.cpu; - - /* - * That's all for this thread or process. - * We leave our residual in expires to be reported. - */ - put_task_struct(timer->task); - timer->task = NULL; - if (timer->expires < now) { - timer->expires = 0; - } else { - timer->expires -= now; - } + cleanup_timers(tsk->signal->cpu_timers); } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -477,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) /* * Insert the timer on the appropriate list before any timers that - * expire later. This must be called with the tasklist_lock held - * for reading, interrupts disabled and p->sighand->siglock taken. + * expire later. This must be called with the sighand lock held. */ static void arm_timer(struct k_itimer *timer) { @@ -569,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * Sample a process (thread group) timer for the given group_leader task. - * Must be called with tasklist_lock held for reading. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, @@ -608,7 +587,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); */ static void posix_cpu_timer_kick_nohz(void) { - schedule_work(&nohz_kick_work); + if (context_tracking_is_enabled()) + schedule_work(&nohz_kick_work); } bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) @@ -631,43 +611,39 @@ static inline void posix_cpu_timer_kick_nohz(void) { } * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -static int posix_cpu_timer_set(struct k_itimer *timer, int flags, +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec *new, struct itimerspec *old) { + unsigned long flags; + struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; unsigned long long old_expires, new_expires, old_incr, val; int ret; - if (unlikely(p == NULL)) { - /* - * Timer refers to a dead task's clock. - */ - return -ESRCH; - } + WARN_ON_ONCE(p == NULL); new_expires = timespec_to_sample(timer->it_clock, &new->it_value); - read_lock(&tasklist_lock); /* - * We need the tasklist_lock to protect against reaping that - * clears p->sighand. If p has just been reaped, we can no + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(p->sighand == NULL)) { - read_unlock(&tasklist_lock); - put_task_struct(p); - timer->it.cpu.task = NULL; + if (unlikely(sighand == NULL)) { return -ESRCH; } /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; - spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; @@ -724,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). */ - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); goto out; } - if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { new_expires += val; } @@ -743,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, arm_timer(timer); } - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); - + unlock_task_sighand(p, &flags); /* * Install the new reload setting, and * set up the signal and overrun bookkeeping. @@ -787,7 +760,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { unsigned long long now; struct task_struct *p = timer->it.cpu.task; - int clear_dead; + + WARN_ON_ONCE(p == NULL); /* * Easy part: convert the reload time. @@ -800,52 +774,34 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) return; } - if (unlikely(p == NULL)) { - /* - * This task already died and the timer will never fire. - * In this case, expires is actually the dead value. - */ - dead: - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); - return; - } - /* * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); - clear_dead = p->exit_state; } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + struct sighand_struct *sighand; + unsigned long flags; + + /* + * Protect against sighand release/switch in exit/exec and + * also make timer sampling safe if it ends up calling + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. * Call the timer disarmed, nothing else to do. */ - put_task_struct(p); - timer->it.cpu.task = NULL; timer->it.cpu.expires = 0; - read_unlock(&tasklist_lock); - goto dead; + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); } else { cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead = (unlikely(p->exit_state) && - thread_group_empty(p)); + unlock_task_sighand(p, &flags); } - read_unlock(&tasklist_lock); - } - - if (unlikely(clear_dead)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto dead; } if (now < timer->it.cpu.expires) { @@ -1059,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk, */ void posix_cpu_timer_schedule(struct k_itimer *timer) { + struct sighand_struct *sighand; + unsigned long flags; struct task_struct *p = timer->it.cpu.task; unsigned long long now; - if (unlikely(p == NULL)) - /* - * The task was cleaned up already, no future firings. - */ - goto out; + WARN_ON_ONCE(p == NULL); /* * Fetch the current sample and update the timer's expiry time. @@ -1074,49 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) { - clear_dead_task(timer, now); + if (unlikely(p->exit_state)) + goto out; + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) goto out; - } - read_lock(&tasklist_lock); /* arm_timer needs it. */ - spin_lock(&p->sighand->siglock); } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + /* + * Protect arm_timer() and timer sampling in case of call to + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. */ - put_task_struct(p); - timer->it.cpu.task = p = NULL; timer->it.cpu.expires = 0; - goto out_unlock; + goto out; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead_task(timer, now); - goto out_unlock; + unlock_task_sighand(p, &flags); + /* Optimizations: if the process is dying, no need to rearm */ + goto out; } - spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); - /* Leave the tasklist_lock locked for the call below. */ + /* Leave the sighand locked for the call below. */ } /* * Now re-arm for the new expiry time. */ - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); arm_timer(timer); - spin_unlock(&p->sighand->siglock); - -out_unlock: - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + /* Kick full dynticks CPUs in case they need to tick on the new timer */ + posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1200,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) struct k_itimer *timer, *next; unsigned long flags; - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1256,13 +1206,6 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - - /* - * In case some timers were rescheduled after the queue got emptied, - * wake up full dynticks CPUs. - */ - if (tsk->signal->cputimer.running) - posix_cpu_timer_kick_nohz(); } /* @@ -1274,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, { unsigned long long now; - BUG_ON(clock_idx == CPUCLOCK_SCHED); + WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d444c4e834f4..2fac9cc79b3d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE + ---help--- + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. + +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout in seconds" + range 1 120 + default 12 + depends on DPM_WATCHDOG + config PM_TRACE bool help diff --git a/kernel/power/console.c b/kernel/power/console.c index 463aa6736751..eacb8bd8cab4 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev) list_for_each_entry(tmp, &pm_vt_switch_list, head) { if (tmp->dev == dev) { list_del(&tmp->head); + kfree(tmp); break; } } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a394297f8b2f..8dff9b48075a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else if (count <= 11) { /* ASCII perhaps? */ - char ascii_value[11]; - unsigned long int ulval; + } else { int ret; - if (copy_from_user(ascii_value, buf, count)) - return -EFAULT; - - if (count > 10) { - if (ascii_value[10] == '\n') - ascii_value[10] = '\0'; - else - return -EINVAL; - } else { - ascii_value[count] = '\0'; - } - ret = kstrtoul(ascii_value, 16, &ulval); - if (ret) { - pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); - return -EINVAL; - } - value = (s32)lower_32_bits(ulval); - } else { - return -EINVAL; + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; } req = filp->private_data; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 98c3b34a4cff..d9f61a145802 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - BUG_ON(!(forbidden_pages_map && free_pages_map)); + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; bm1 = forbidden_pages_map; bm2 = free_pages_map; @@ -1402,7 +1403,11 @@ int hibernate_preallocate_memory(void) * highmem and non-highmem zones separately. */ pages_highmem = preallocate_image_highmem(highmem / 2); - alloc = (count - max_size) - pages_highmem; + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; pages = preallocate_image_memory(alloc, avail_normal); if (pages < alloc) { /* We have exhausted non-highmem pages, try highmem. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 957f06164ad1..98d357584cd6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -36,9 +36,9 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; - char frozen; - char ready; - char platform_support; + bool frozen; + bool ready; + bool platform_support; bool free_bitmaps; } snapshot_state; @@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; + data->free_bitmaps = false; error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); @@ -93,9 +94,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) atomic_inc(&snapshot_device_available); - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; + data->frozen = false; + data->ready = false; + data->platform_support = false; Unlock: unlock_system_sleep(); @@ -229,7 +230,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (error) thaw_processes(); else - data->frozen = 1; + data->frozen = true; break; @@ -240,7 +241,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); - data->frozen = 0; + data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: @@ -270,7 +271,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; + data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after @@ -334,7 +335,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - data->ready = 0; + data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b4e8500afdb3..f8b41bddc6dc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = { #ifdef CONFIG_KEXEC /* - * This appends the listed symbols to /proc/vmcoreinfo + * This appends the listed symbols to /proc/vmcore * - * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. @@ -757,14 +757,10 @@ void __init setup_log_buf(int early) return; if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); + new_log_buf = + memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); } if (unlikely(!new_log_buf)) { @@ -791,7 +787,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + pr_info("debug: ignoring loglevel setting.\n"); return 0; } @@ -820,9 +816,9 @@ static int __init boot_delay_setup(char *str) pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); - return 1; + return 0; } -__setup("boot_delay=", boot_delay_setup); +early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { @@ -2193,7 +2189,7 @@ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; - printk(KERN_INFO "debug: skip boot console de-registration.\n"); + pr_info("debug: skip boot console de-registration.\n"); return 0; } @@ -2241,7 +2237,7 @@ void register_console(struct console *newcon) /* find the last or real console */ for_each_console(bcon) { if (!(bcon->flags & CON_BOOT)) { - printk(KERN_INFO "Too late to register bootconsole %s%d\n", + pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); return; } @@ -2358,21 +2354,18 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ + pr_info("%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { - /* we need to iterate through twice, to make sure we print - * everything out, before we unregister the console(s) + /* We need to iterate through all boot consoles, to make + * sure we print everything out, before we unregister them. */ - printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", - newcon->name, newcon->index); for_each_console(bcon) if (bcon->flags & CON_BOOT) unregister_console(bcon); - } else { - printk(KERN_INFO "%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); } } EXPORT_SYMBOL(register_console); @@ -2382,6 +2375,10 @@ int unregister_console(struct console *console) struct console *a, *b; int res; + pr_info("%sconsole [%s%d] disabled\n", + (console->flags & CON_BOOT) ? "boot" : "" , + console->name, console->index); + res = _braille_unregister_console(console); if (res) return res; @@ -2421,8 +2418,6 @@ static int __init printk_late_init(void) for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - printk(KERN_INFO "turn off boot console %s%d\n", - con->name, con->index); unregister_console(con); } } @@ -2449,7 +2444,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) if (pending & PRINTK_PENDING_SCHED) { char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); + pr_warn("[sched_delayed] %s", buf); } if (pending & PRINTK_PENDING_WAKEUP) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dd562e9aa2c8..1f4bcb3cc21c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -257,7 +257,8 @@ ok: if (task->mm) dumpable = get_dumpable(task->mm); rcu_read_lock(); - if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + if (dumpable != SUID_DUMP_USER && + !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { rcu_read_unlock(); return -EPERM; } diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile @@ -0,0 +1,6 @@ +obj-y += update.o srcu.o +obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_TREE_RCU) += tree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o +obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h index 77131966c4ad..79c3877e9c5b 100644 --- a/kernel/rcu.h +++ b/kernel/rcu/rcu.h @@ -96,19 +96,22 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -extern void kfree(const void *); +void kfree(const void *); static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; + rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); return 1; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); + rcu_lock_release(&rcu_callback_map); return 0; } } @@ -122,4 +125,11 @@ int rcu_jiffies_till_stall_check(void); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + #endif /* __LINUX_RCU_H */ diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..3318d8284384 100644 --- a/kernel/srcu.c +++ b/kernel/rcu/srcu.c @@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp) /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)) @@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) * Note that it is illegal to call synchronize_srcu() from the corresponding * SRCU read-side critical section; doing so will result in deadlock. * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. */ void synchronize_srcu(struct srcu_struct *sp) { @@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. * - * Note that it is also illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; - * doing so will result in deadlock. However, it is perfectly legal - * to call synchronize_srcu_expedited() on one srcu_struct from some - * other srcu_struct's read-side critical section, as long as - * the resulting graph of srcu_structs is acyclic. + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). */ void synchronize_srcu_expedited(struct srcu_struct *sp) { @@ -491,6 +533,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. */ void srcu_barrier(struct srcu_struct *sp) { diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c index 9ed6075dc562..1254f312d024 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcu/tiny.c @@ -35,6 +35,7 @@ #include <linux/time.h> #include <linux/cpu.h> #include <linux/prefetch.h> +#include <linux/ftrace_event.h> #ifdef CONFIG_RCU_TRACE #include <trace/events/rcu.h> @@ -42,7 +43,7 @@ #include "rcu.h" -/* Forward declarations for rcutiny_plugin.h. */ +/* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); @@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head, static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; -#include "rcutiny_plugin.h" +#include "tiny_plugin.h" /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long newval) { if (newval) { - RCU_TRACE(trace_rcu_dyntick("--=", + RCU_TRACE(trace_rcu_dyntick(TPS("--="), rcu_dynticks_nesting, newval)); rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); + RCU_TRACE(trace_rcu_dyntick(TPS("Start"), + rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); static void rcu_idle_exit_common(long long oldval) { if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", + RCU_TRACE(trace_rcu_dyntick(TPS("++="), oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -174,18 +176,18 @@ void rcu_irq_enter(void) } EXPORT_SYMBOL_GPL(rcu_irq_enter); -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) /* * Test whether RCU thinks that the current CPU is idle. */ -int rcu_is_cpu_idle(void) +bool notrace __rcu_is_watching(void) { - return !rcu_dynticks_nesting; + return rcu_dynticks_nesting; } -EXPORT_SYMBOL(rcu_is_cpu_idle); +EXPORT_SYMBOL(__rcu_is_watching); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ /* * Test whether the current CPU was interrupted from idle. Nested @@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) if (&rcp->rcucblist == rcp->donetail) { RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - ACCESS_ONCE(rcp->rcucblist), + !!ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), false)); @@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), + RCU_TRACE(trace_rcu_batch_end(rcp->name, + cb_count, 0, need_resched(), is_idle_task(current), false)); } diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c index be63101c6175..732f8ae3086a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcu/torture.c @@ -52,6 +52,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." + static int fqs_duration; module_param(fqs_duration, int, 0444); MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); @@ -133,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define VERBOSE_PRINTK_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) -static char printk_buf[4096]; - static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; @@ -370,7 +374,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - int (*stats)(char *page); + void (*stats)(char *page); int irq_capable; int can_boost; const char *name; @@ -572,21 +576,19 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static int srcu_torture_stats(char *page) +static void srcu_torture_stats(char *page) { - int cnt = 0; int cpu; int idx = srcu_ctl.completed & 0x1; - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + page += sprintf(page, "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, + page += sprintf(page, " %d(%lu,%lu)", cpu, per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + sprintf(page, "\n"); } static void srcu_torture_synchronize_expedited(void) @@ -1046,10 +1048,9 @@ rcu_torture_reader(void *arg) /* * Create an RCU-torture statistics message in the specified buffer. */ -static int +static void rcu_torture_printk(char *page) { - int cnt = 0; int cpu; int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; @@ -1065,8 +1066,8 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, @@ -1074,53 +1075,52 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", + page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); - cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", + page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - cnt += sprintf(&page[cnt], + page += sprintf(page, "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", n_online_successes, n_online_attempts, n_offline_successes, n_offline_attempts, min_online, max_online, min_offline, max_offline, sum_online, sum_offline, HZ); - cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", + page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - cnt += sprintf(&page[cnt], "!!! "); + page += sprintf(page, "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - cnt += sprintf(&page[cnt], "Reader Pipe: "); + page += sprintf(page, "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); + page += sprintf(page, " %ld", pipesummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + page += sprintf(page, " %ld", batchsummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", + page += sprintf(page, " %d", atomic_read(&rcu_torture_wcount[i])); } - cnt += sprintf(&page[cnt], "\n"); + page += sprintf(page, "\n"); if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; + cur_ops->stats(page); } /* @@ -1134,10 +1134,17 @@ rcu_torture_printk(char *page) static void rcu_torture_stats_print(void) { - int cnt; + int size = nr_cpu_ids * 200 + 8192; + char *buf; - cnt = rcu_torture_printk(printk_buf); - pr_alert("%s", printk_buf); + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("rcu-torture: Out of memory, need: %d", size); + return; + } + rcu_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); } /* @@ -1572,6 +1579,7 @@ static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; bool lastphase = 0; + bool newphase; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1579,10 +1587,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - barrier_phase != lastphase || + (newphase = + ACCESS_ONCE(barrier_phase)) != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); - lastphase = barrier_phase; + lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; @@ -1619,7 +1628,7 @@ static int rcu_torture_barrier(void *arg) if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; n_barrier_attempts++; - cur_ops->cb_barrier(); + cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; WARN_ON_ONCE(1); diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c index 32618b3fe4e6..b3d116cd072d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcu/tree.c @@ -41,6 +41,7 @@ #include <linux/export.h> #include <linux/completion.h> #include <linux/moduleparam.h> +#include <linux/module.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -56,17 +57,16 @@ #include <linux/ftrace_event.h> #include <linux/suspend.h> -#include "rcutree.h" +#include "tree.h" #include <trace/events/rcu.h> #include "rcu.h" -/* - * Strings used in tracepoints need to be exported via the - * tracing system such that tools like perf and trace-cmd can - * translate the string address pointers to actual text. - */ -#define TPS(x) tracepoint_string(x) +MODULE_ALIAS("rcutree"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutree." /* Data structures. */ @@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE @@ -369,9 +369,13 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { + struct rcu_state *rsp; + struct rcu_data *rdp; + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); @@ -379,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + do_nocb_deferred_wakeup(rdp); + } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ @@ -407,14 +415,15 @@ static void rcu_eqs_enter(bool user) long long oldval; struct rcu_dynticks *rdtp; - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - else + rcu_eqs_enter_common(rdtp, oldval, user); + } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_eqs_enter_common(rdtp, oldval, user); + } } /** @@ -435,7 +444,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); + rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -478,7 +487,7 @@ void rcu_irq_exit(void) struct rcu_dynticks *rdtp; local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); @@ -508,7 +517,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); @@ -528,14 +538,15 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) + if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else + } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(rdtp, oldval, user); + } } /** @@ -555,7 +566,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); + rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -599,7 +610,7 @@ void rcu_irq_enter(void) long long oldval; local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); @@ -620,7 +631,7 @@ void rcu_irq_enter(void) */ void rcu_nmi_enter(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 && (atomic_read(&rdtp->dynticks) & 0x1)) @@ -642,7 +653,7 @@ void rcu_nmi_enter(void) */ void rcu_nmi_exit(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 || --rdtp->dynticks_nmi_nesting != 0) @@ -655,21 +666,34 @@ void rcu_nmi_exit(void) } /** - * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle + * __rcu_is_watching - are RCU read-side critical sections safe? + * + * Return true if RCU is watching the running CPU, which means that + * this CPU can safely enter RCU read-side critical sections. Unlike + * rcu_is_watching(), the caller of __rcu_is_watching() must have at + * least disabled preemption. + */ +bool notrace __rcu_is_watching(void) +{ + return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; +} + +/** + * rcu_is_watching - see if RCU thinks that the current CPU is idle * * If the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ -int rcu_is_cpu_idle(void) +bool notrace rcu_is_watching(void) { int ret; preempt_disable(); - ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + ret = __rcu_is_watching(); preempt_enable(); return ret; } -EXPORT_SYMBOL(rcu_is_cpu_idle); +EXPORT_SYMBOL_GPL(rcu_is_watching); #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) @@ -701,9 +725,9 @@ bool rcu_lockdep_current_cpu_online(void) bool ret; if (in_nmi()) - return 1; + return true; preempt_disable(); - rdp = &__get_cpu_var(rcu_sched_data); + rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; @@ -723,7 +747,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; } /* @@ -740,6 +764,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, } /* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + +/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -797,13 +827,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, */ rcu_kick_nohz_cpu(rdp->cpu); + /* + * Alternatively, the CPU might be running in the kernel + * for an extended period of time without a quiescent state. + * Attempt to force the CPU through the scheduler to gain the + * needed quiescent state, but only if the grace period has gone + * on for an uncommonly long time. If there are many stuck CPUs, + * we will beat on the first one until it gets unstuck, then move + * to the next. Only do this for the primary flavor of RCU. + */ + if (rdp->rsp == rcu_state && + ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { + rdp->rsp->jiffies_resched += 5; + resched_cpu(rdp->cpu); + } + return 0; } static void record_gp_stall_check_time(struct rcu_state *rsp) { - rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j1; + + rsp->gp_start = j; + smp_wmb(); /* Record start time before stall time. */ + j1 = rcu_jiffies_till_stall_check(); + rsp->jiffies_stall = j + j1; + rsp->jiffies_resched = j + j1 / 2; } /* @@ -898,6 +949,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -927,22 +984,60 @@ static void print_cpu_stall(struct rcu_state *rsp) 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. + */ + resched_cpu(smp_processor_id()); } static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { + unsigned long completed; + unsigned long gpnum; + unsigned long gps; unsigned long j; unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress) + if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; j = ACCESS_ONCE(jiffies); + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, + * then rsp->gp_start, and finally rsp->completed. These values + * are updated in the opposite order with memory barriers (or + * equivalent) during grace-period initialization and cleanup. + * Now, a false positive can occur if we get an new value of + * rsp->gp_start and a old value of rsp->jiffies_stall. But given + * the memory barriers, the only way that this can happen is if one + * grace period ends and another starts between these two fetches. + * Detect this by comparing rsp->completed with the previous fetch + * from rsp->gpnum. + * + * Given this check, comparisons of jiffies, rsp->jiffies_stall, + * and rsp->gp_start suffice to forestall false positives. + */ + gpnum = ACCESS_ONCE(rsp->gpnum); + smp_rmb(); /* Pick up ->gpnum first... */ js = ACCESS_ONCE(rsp->jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = ACCESS_ONCE(rsp->gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->completed. */ + completed = ACCESS_ONCE(rsp->completed); + if (ULONG_CMP_GE(completed, gpnum) || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -1071,8 +1166,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) + if (rnp != rnp_root) { raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); + } /* * Get a new grace-period number. If there really is no grace @@ -1292,12 +1389,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } + smp_mb__after_unlock_lock(); __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* - * Initialize a new grace period. + * Initialize a new grace period. Return 0 if no grace period required. */ static int rcu_gp_init(struct rcu_state *rsp) { @@ -1306,18 +1404,28 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); + if (rsp->gp_flags == 0) { + /* Spurious wakeup, tell caller to go back to sleep. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; + } rsp->gp_flags = 0; /* Clear all flags: New grace period. */ - if (rcu_gp_in_progress(rsp)) { - /* Grace period already in progress, don't start another. */ + if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { + /* + * Grace period already in progress, don't start another. + * Not supposed to be able to happen. + */ raw_spin_unlock_irq(&rnp->lock); return 0; } /* Advance to a new grace period and initialize state. */ + record_gp_stall_check_time(rsp); + smp_wmb(); /* Record GP times before starting GP. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ @@ -1338,6 +1446,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1366,7 +1475,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* * Do one round of quiescent-state forcing. */ -int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; bool isidle = false; @@ -1392,6 +1501,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rsp->gp_flags &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } @@ -1409,6 +1519,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1434,16 +1545,19 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) __note_gp_changes(rsp, rnp, rdp); + /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ @@ -1451,8 +1565,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) - rsp->gp_flags = 1; + if (cpu_needs_another_gp(rsp, rdp)) { + rsp->gp_flags = RCU_GP_FLAG_INIT; + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("newreq")); + } raw_spin_unlock_irq(&rnp->lock); } @@ -1462,6 +1580,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) static int __noreturn rcu_gp_kthread(void *arg) { int fqs_state; + int gf; unsigned long j; int ret; struct rcu_state *rsp = arg; @@ -1471,14 +1590,20 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("reqwait")); wait_event_interruptible(rsp->gp_wq, - rsp->gp_flags & + ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); - if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && - rcu_gp_init(rsp)) + /* Locking provides needed memory barrier. */ + if (rcu_gp_init(rsp)) break; cond_resched(); flush_signals(current); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ @@ -1488,25 +1613,42 @@ static int __noreturn rcu_gp_kthread(void *arg) j = HZ; jiffies_till_first_fqs = HZ; } + ret = 0; for (;;) { - rsp->jiffies_force_qs = jiffies + j; + if (!ret) + rsp->jiffies_force_qs = jiffies + j; + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqswait")); ret = wait_event_interruptible_timeout(rsp->gp_wq, - (rsp->gp_flags & RCU_GP_FLAG_FQS) || + ((gf = ACCESS_ONCE(rsp->gp_flags)) & + RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); + /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { + if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || + (gf & RCU_GP_FLAG_FQS)) { + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqsstart")); fqs_state = rcu_gp_fqs(rsp, fqs_state); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqsend")); cond_resched(); } else { /* Deal with stray signal. */ cond_resched(); flush_signals(current); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqswaitsig")); } j = jiffies_till_next_fqs; if (j > HZ) { @@ -1554,6 +1696,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, return; } rsp->gp_flags = RCU_GP_FLAG_INIT; + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + TPS("newreq")); /* * We can't do wakeups while holding the rnp->lock, as that @@ -1650,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); WARN_ON_ONCE(rnp_c->qsmask); } @@ -1679,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum) { @@ -1802,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Adopt the RCU callbacks from the specified rcu_state structure's * orphanage. The caller must hold the ->orphan_lock. */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. */ @@ -1887,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp); + rcu_adopt_orphan_cbs(rsp, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) @@ -2103,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp, cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (!rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2132,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rnp = rcu_get_root(rsp); if (rnp->qsmask == 0) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ } } @@ -2164,6 +2313,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave(&rnp_old->lock, flags); + smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2204,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* If there are callbacks ready, invoke them. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_callbacks(rsp, rdp); + + /* Do any needed deferred wakeups of rcuo kthreads. */ + do_nocb_deferred_wakeup(rdp); } /* @@ -2255,7 +2408,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) + if (!rcu_is_watching() && cpu_online(smp_processor_id())) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ @@ -2279,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_node *rnp_root = rcu_get_root(rsp); raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); } else { @@ -2338,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy); + offline = !__call_rcu_nocb(rdp, head, lazy, flags); WARN_ON_ONCE(offline); /* _call_rcu() is illegal on offline CPU; leak the callback. */ local_irq_restore(flags); @@ -2658,6 +2812,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); + /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ + if (rcu_nohz_full_cpu(rsp)) + return 0; + /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { @@ -2691,6 +2849,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp)) { + rdp->n_rp_nocb_defer_wakeup++; + return 1; + } + /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -2725,10 +2889,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->qlen != rdp->qlen_lazy) + if (!rdp->nxtlist) + continue; + hc = true; + if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { al = false; - if (rdp->nxtlist) - hc = true; + break; + } } if (all_lazy) *all_lazy = al; @@ -3112,9 +3279,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = rcu_num_lvls - 1; i > 0; i--) + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -3216,7 +3383,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* * Compute the rcu_node tree geometry from kernel parameters. This cannot - * replace the definitions in rcutree.h because those are needed to size + * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. */ static void __init rcu_init_geometry(void) @@ -3244,6 +3411,8 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + rcu_fanout_leaf, nr_cpu_ids); /* * Compute number of nodes that can be handled an rcu_node tree @@ -3295,8 +3464,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); @@ -3311,4 +3480,4 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } -#include "rcutree_plugin.h" +#include "tree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h index 5f97eab602cd..8c19873f1ac9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcu/tree.h @@ -104,6 +104,8 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ + unsigned long last_advance_all; + /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; @@ -315,6 +317,7 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; + unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; /* 6) _rcu_barrier() and OOM callbacks. */ @@ -333,6 +336,7 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -451,6 +455,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + unsigned long jiffies_resched; /* Time at which to resched */ + /* a reluctant CPU. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ @@ -546,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy); + bool lazy, unsigned long flags); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp); + struct rcu_data *rdp, + unsigned long flags); +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); @@ -562,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj); static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); +static bool rcu_nohz_full_cpu(struct rcu_state *rsp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h index 130c97b027f2..6e2ef4b2b920 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,7 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> -#include "time/tick-internal.h" +#include "../time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void) #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_setall(rcu_nocb_mask); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) @@ -199,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu) rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -307,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -356,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { rcu_preempt_qs(smp_processor_id()); + if (!t->rcu_read_unlock_special) { + local_irq_restore(flags); + return; + } } - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + /* Hardware IRQ handlers cannot block, complain if they get here. */ + if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { local_irq_restore(flags); return; } @@ -376,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t) for (;;) { rnp = t->rcu_blocked_node; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); if (rnp == t->rcu_blocked_node) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -600,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, while (!list_empty(lp)) { t = list_entry(lp->next, typeof(*t), rcu_node_entry); raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); list_del(&t->rcu_node_entry); t->rcu_blocked_node = rnp_root; list_add(&t->rcu_node_entry, lp_root); @@ -624,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * in this case. */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); if (rnp_root->boost_tasks != NULL && rnp_root->boost_tasks != rnp_root->gp_tasks && rnp_root->boost_tasks != rnp_root->exp_tasks) @@ -660,7 +674,7 @@ static void rcu_preempt_check_callbacks(int cpu) static void rcu_preempt_do_callbacks(void) { - rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -767,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -774,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ wake_up(&sync_rcu_preempt_exp_wq); + } break; } mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); rnp->expmask &= ~mask; } } @@ -801,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { @@ -881,6 +900,7 @@ void synchronize_rcu_expedited(void) /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->expmask = rnp->qsmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1128,7 +1148,7 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "rtmutex_common.h" +#include "../locking/rtmutex_common.h" #ifdef CONFIG_RCU_TRACE @@ -1186,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* * Recheck under the lock: all tasks in need of boosting @@ -1332,7 +1353,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __get_cpu_var(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1372,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_BOOST_PRIO; @@ -1382,8 +1404,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, static void rcu_kthread_do_work(void) { - rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); rcu_preempt_do_callbacks(); } @@ -1402,7 +1424,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __get_cpu_var(rcu_cpu_has_work); + return __this_cpu_read(rcu_cpu_has_work); } /* @@ -1412,8 +1434,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); - char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { @@ -1627,20 +1649,26 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_enabled; +extern int tick_nohz_active; /* - * Try to advance callbacks for all flavors of RCU on the current CPU. - * Afterwards, if there are any callbacks ready for immediate invocation, - * return true. + * Try to advance callbacks for all flavors of RCU on the current CPU, but + * only if it has been awhile since the last time we did so. Afterwards, + * if there are any callbacks ready for immediate invocation, return true. */ static bool rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; + /* Exit early if we advanced recently. */ + if (jiffies == rdtp->last_advance_all) + return 0; + rdtp->last_advance_all = jiffies; + for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); rnp = rdp->mynode; @@ -1718,7 +1746,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); + tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ @@ -1739,6 +1767,8 @@ static void rcu_prepare_for_idle(int cpu) */ if (rdtp->all_lazy && rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + rdtp->all_lazy = false; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1756,6 +1786,7 @@ static void rcu_prepare_for_idle(int cpu) continue; rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -1768,17 +1799,11 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - struct rcu_data *rdp; - struct rcu_state *rsp; if (rcu_is_nocb_cpu(cpu)) return; - rcu_try_advance_all_cbs(); - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_core(); - } + if (rcu_try_advance_all_cbs()) + invoke_rcu_core(); } /* @@ -1845,6 +1870,7 @@ static int rcu_oom_notify(struct notifier_block *self, /* Wait for callbacks from earlier instance to complete. */ wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + smp_mb(); /* Ensure callback reuse happens after callback invocation. */ /* * Prevent premature wakeup: ensure that all increments happen @@ -2094,7 +2120,8 @@ bool rcu_is_nocb_cpu(int cpu) static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct rcu_head *rhp, struct rcu_head **rhtp, - int rhcount, int rhcount_lazy) + int rhcount, int rhcount_lazy, + unsigned long flags) { int len; struct rcu_head **old_rhpp; @@ -2108,15 +2135,29 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll | !t) + if (rcu_nocb_poll || !t) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeNotPoll")); return; + } len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { - wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + if (!irqs_disabled_flags(flags)) { + wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeEmpty")); + } else { + rdp->nocb_defer_wakeup = true; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeEmptyIsDeferred")); + } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { wake_up_process(t); /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = LONG_MAX / 2; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); + } else { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); } return; } @@ -2131,19 +2172,21 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, * "rcuo" kthread can find it. */ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) + bool lazy, unsigned long flags) { if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, (unsigned long)rhp->func, - rdp->qlen_lazy, rdp->qlen); + -atomic_long_read(&rdp->nocb_q_count_lazy), + -atomic_long_read(&rdp->nocb_q_count)); else trace_rcu_callback(rdp->rsp->name, rhp, - rdp->qlen_lazy, rdp->qlen); + -atomic_long_read(&rdp->nocb_q_count_lazy), + -atomic_long_read(&rdp->nocb_q_count)); return 1; } @@ -2152,7 +2195,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * not a no-CBs CPU. */ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) + struct rcu_data *rdp, + unsigned long flags) { long ql = rsp->qlen; long qll = rsp->qlen_lazy; @@ -2166,14 +2210,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, /* First, enqueue the donelist, if any. This preserves CB ordering. */ if (rsp->orphan_donelist != NULL) { __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll); + rsp->orphan_donetail, ql, qll, flags); ql = qll = 0; rsp->orphan_donelist = NULL; rsp->orphan_donetail = &rsp->orphan_donelist; } if (rsp->orphan_nxtlist != NULL) { __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll); + rsp->orphan_nxttail, ql, qll, flags); ql = qll = 0; rsp->orphan_nxtlist = NULL; rsp->orphan_nxttail = &rsp->orphan_nxtlist; @@ -2193,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); c = rcu_start_future_gp(rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -2221,6 +2266,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2229,14 +2275,28 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("Sleep")); wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); + /* Memory barrier provide by xchg() below. */ + } else if (firsttime) { + firsttime = 0; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("Poll")); + } list = ACCESS_ONCE(rdp->nocb_head); if (!list) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeEmpty")); schedule_timeout_interruptible(1); flush_signals(current); continue; } + firsttime = 1; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeNonEmpty")); /* * Extract queued callbacks, update counts, and wait @@ -2257,7 +2317,11 @@ static int rcu_nocb_kthread(void *arg) next = list->next; /* Wait for enqueuing to complete, if needed. */ while (next == NULL && &list->next != tail) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WaitQueue")); schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeQueue")); next = list->next; } debug_rcu_head_unqueue(list); @@ -2276,6 +2340,22 @@ static int rcu_nocb_kthread(void *arg) return 0; } +/* Is a deferred wakeup of rcu_nocb_kthread() required? */ +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +{ + return ACCESS_ONCE(rdp->nocb_defer_wakeup); +} + +/* Do a deferred wakeup of rcu_nocb_kthread(). */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (!rcu_nocb_need_deferred_wakeup(rdp)) + return; + ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; + wake_up(&rdp->nocb_wq); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); +} + /* Initialize per-rcu_data variables for no-CBs CPUs. */ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { @@ -2331,13 +2411,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) } static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) + bool lazy, unsigned long flags) { return 0; } static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) + struct rcu_data *rdp, + unsigned long flags) { return 0; } @@ -2346,6 +2427,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ +} + static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } @@ -2795,3 +2885,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) } #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +/* + * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the + * grace-period kthread will do force_quiescent_state() processing? + * The idea is to avoid waking up RCU core processing on such a + * CPU unless the grace period has extended for too long. + * + * This code relies on the fact that all NO_HZ_FULL CPUs are also + * CONFIG_RCU_NOCB_CPUs. + */ +static bool rcu_nohz_full_cpu(struct rcu_state *rsp) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(smp_processor_id()) && + (!rcu_gp_in_progress(rsp) || + ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) + return 1; +#endif /* #ifdef CONFIG_NO_HZ_FULL */ + return 0; +} diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c index cf6c17412932..4def475336d4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -44,7 +44,7 @@ #include <linux/seq_file.h> #define RCU_TREE_NONCORE -#include "rcutree.h" +#include "tree.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) @@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, + rdp->n_rp_nocb_defer_wakeup, rdp->n_rp_need_nothing); } diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c index b02a339836b4..802365ccd591 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcu/update.c @@ -53,6 +53,12 @@ #include "rcu.h" +MODULE_ALIAS("rcupdate"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcupdate." + module_param(rcu_expedited, int, 0); #ifdef CONFIG_PREEMPT_RCU @@ -122,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +static struct lock_class_key rcu_callback_key; +struct lockdep_map rcu_callback_map = + STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); +EXPORT_SYMBOL_GPL(rcu_callback_map); + int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active && debug_locks && @@ -148,7 +159,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) + if (!rcu_is_watching()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; @@ -298,7 +309,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); diff --git a/kernel/reboot.c b/kernel/reboot.c index f813b3474646..662c83fc16b7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -static void migrate_to_reboot_cpu(void) +void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ int cpu = reboot_cpu; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..9a95c8c2af2a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,8 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o -obj-$(CONFIG_SMP) += cpupri.o +obj-y += core.o proc.o clock.o cputime.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += wait.o completion.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..6bd6a6731b21 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -26,9 +26,10 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) * local_clock() -- is cpu_clock() on the current cpu. * + * sched_clock_cpu(i) + * * How: * * The implementation either uses sched_clock() when @@ -50,15 +51,6 @@ * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -66,6 +58,8 @@ #include <linux/percpu.h> #include <linux/ktime.h> #include <linux/sched.h> +#include <linux/static_key.h> +#include <linux/workqueue.h> /* * Scheduler clock - returns current time in nanosec units. @@ -82,7 +76,37 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +static struct static_key __sched_clock_stable = STATIC_KEY_INIT; + +int sched_clock_stable(void) +{ + if (static_key_false(&__sched_clock_stable)) + return false; + return true; +} + +void set_sched_clock_stable(void) +{ + if (!sched_clock_stable()) + static_key_slow_dec(&__sched_clock_stable); +} + +static void __clear_sched_clock_stable(struct work_struct *work) +{ + /* XXX worry about clock continuity */ + if (sched_clock_stable()) + static_key_slow_inc(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + +void clear_sched_clock_stable(void) +{ + if (keventd_up()) + schedule_work(&sched_clock_work); + else + __clear_sched_clock_stable(&sched_clock_work); +} struct sched_clock_data { u64 tick_raw; @@ -242,20 +266,20 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd; u64 clock; - WARN_ON_ONCE(!irqs_disabled()); - - if (sched_clock_stable) + if (sched_clock_stable()) return sched_clock(); if (unlikely(!sched_clock_running)) return 0ull; + preempt_disable(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); + preempt_enable(); return clock; } @@ -265,7 +289,7 @@ void sched_clock_tick(void) struct sched_clock_data *scd; u64 now, now_gtod; - if (sched_clock_stable) + if (sched_clock_stable()) return; if (unlikely(!sched_clock_running)) @@ -316,14 +340,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); + if (static_key_false(&__sched_clock_stable)) + return sched_clock_cpu(cpu); - return clock; + return sched_clock(); } /* @@ -335,14 +355,10 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - u64 clock; - unsigned long flags; + if (static_key_false(&__sched_clock_stable)) + return sched_clock_cpu(raw_smp_processor_id()); - local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); - local_irq_restore(flags); - - return clock; + return sched_clock(); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ @@ -362,12 +378,12 @@ u64 sched_clock_cpu(int cpu) u64 cpu_clock(int cpu) { - return sched_clock_cpu(cpu); + return sched_clock(); } u64 local_clock(void) { - return sched_clock_cpu(0); + return sched_clock(); } #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 000000000000..a63f4dc27909 --- /dev/null +++ b/kernel/sched/completion.c @@ -0,0 +1,299 @@ +/* + * Generic wait-for-completion handler; + * + * It differs from semaphores in that their default case is the opposite, + * wait_for_completion default blocks whereas semaphore default non-block. The + * interface also makes it easy to 'complete' multiple waiting threads, + * something which isn't entirely natural for semaphores. + * + * But more importantly, the primitive documents the usage. Semaphores would + * typically be used for exclusion which gives rise to priority inversion. + * Waiting for completion is a typically sync point, but not an exclusion point. + */ + +#include <linux/sched.h> +#include <linux/completion.h> + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_locked(&x->wait, TASK_NORMAL, 1); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_locked(&x->wait, TASK_NORMAL, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = action(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, action, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Return: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Return: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..4d6964e49711 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -296,8 +296,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; - - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -513,12 +511,11 @@ static inline void init_hrtick(void) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -#ifdef CONFIG_SMP void resched_task(struct task_struct *p) { int cpu; - assert_raw_spin_locked(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -526,8 +523,10 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { + set_preempt_need_resched(); return; + } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -546,6 +545,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers @@ -693,12 +693,6 @@ void sched_avg_update(struct rq *rq) } } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +761,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(p); + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(p); + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -903,7 +897,9 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (task_has_dl_policy(p)) + prio = MAX_DL_PRIO-1; + else if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -949,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class->switched_from) prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) + } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); } @@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + !(task_preempt_count(p) & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1017,6 +1013,108 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ + if (p->on_rq) { + struct rq *src_rq, *dst_rq; + + src_rq = task_rq(p); + dst_rq = cpu_rq(cpu); + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); + check_preempt_curr(dst_rq, p, 0); + } else { + /* + * Task isn't running anymore; make it appear like we migrated + * it before it went to sleep. This means on wakeup we make the + * previous cpu our targer instead of where it really is. + */ + p->wake_cpu = cpu; + } +} + +struct migration_swap_arg { + struct task_struct *src_task, *dst_task; + int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ + struct migration_swap_arg *arg = data; + struct rq *src_rq, *dst_rq; + int ret = -EAGAIN; + + src_rq = cpu_rq(arg->src_cpu); + dst_rq = cpu_rq(arg->dst_cpu); + + double_raw_lock(&arg->src_task->pi_lock, + &arg->dst_task->pi_lock); + double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) + goto unlock; + + if (task_cpu(arg->src_task) != arg->src_cpu) + goto unlock; + + if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + goto unlock; + + if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + goto unlock; + + __migrate_swap_task(arg->src_task, arg->dst_cpu); + __migrate_swap_task(arg->dst_task, arg->src_cpu); + + ret = 0; + +unlock: + double_rq_unlock(src_rq, dst_rq); + raw_spin_unlock(&arg->dst_task->pi_lock); + raw_spin_unlock(&arg->src_task->pi_lock); + + return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ + struct migration_swap_arg arg; + int ret = -EINVAL; + + arg = (struct migration_swap_arg){ + .src_task = cur, + .src_cpu = task_cpu(cur), + .dst_task = p, + .dst_cpu = task_cpu(p), + }; + + if (arg.src_cpu == arg.dst_cpu) + goto out; + + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. + */ + if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) + goto out; + + if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + goto out; + + if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + goto out; + + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); + ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: + return ret; +} + struct migration_arg { struct task_struct *task; int dest_cpu; @@ -1236,9 +1334,9 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1330,12 +1428,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + u64 max = 2*rq->max_idle_balance_cost; - if (delta > max) + update_avg(&rq->avg_idle, delta); + + if (rq->avg_idle > max) rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; } #endif @@ -1396,6 +1495,13 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -1611,6 +1717,13 @@ static void __sched_fork(struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif + RB_CLEAR_NODE(&p->dl.rb_node); + hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + p->dl.dl_runtime = p->dl.runtime = 0; + p->dl.dl_deadline = p->dl.deadline = 0; + p->dl.dl_period = 0; + p->dl.flags = 0; + INIT_LIST_HEAD(&p->rt.run_list); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1619,16 +1732,24 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies; - p->mm->numa_next_reset = jiffies; + p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); p->mm->numa_scan_seq = 0; } + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; + p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->numa_faults_buffer = NULL; + + INIT_LIST_HEAD(&p->numa_entry); + p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1654,12 +1775,12 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -1676,7 +1797,7 @@ void sched_fork(struct task_struct *p) * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; @@ -1693,8 +1814,14 @@ void sched_fork(struct task_struct *p) p->sched_reset_on_fork = 0; } - if (!rt_prio(p->prio)) + if (dl_prio(p->prio)) { + put_cpu(); + return -EAGAIN; + } else if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } else { p->sched_class = &fair_sched_class; + } if (p->sched_class->task_fork) p->sched_class->task_fork(p); @@ -1717,18 +1844,128 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif + init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif put_cpu(); + return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } /* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; + + if (new_bw == p->dl.dl_bw) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + +/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping @@ -1747,7 +1984,7 @@ void wake_up_new_task(struct task_struct *p) * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug */ - set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); + set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif /* Initialize new task's runnable average */ @@ -1838,7 +2075,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); - sched_info_switch(prev, next); + sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); @@ -1890,6 +2127,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { + task_numa_free(prev); + + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -2073,7 +2315,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2140,6 +2382,20 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns = 0; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + /* + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. + * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. + */ + if (!p->on_cpu) + return p->se.sum_exec_runtime; +#endif + rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, p, &flags); @@ -2169,7 +2425,7 @@ void scheduler_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + trigger_load_balance(rq); #endif rq_last_tick_reset(rq); } @@ -2215,7 +2471,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void __kprobes preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2224,7 +2480,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - preempt_count() += val; + __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -2235,9 +2491,9 @@ void __kprobes add_preempt_count(int val) if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void __kprobes preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2255,9 +2511,9 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; + __preempt_count_sub(val); } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); #endif @@ -2287,10 +2543,10 @@ static inline void schedule_debug(struct task_struct *prev) { /* * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. + * schedule() atomically, we ignore that path. Otherwise whine + * if we are scheduling when we should not. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) __schedule_bug(prev); rcu_sleep_check(); @@ -2430,6 +2686,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2520,9 +2777,9 @@ asmlinkage void __sched notrace preempt_schedule(void) return; do { - add_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2532,6 +2789,7 @@ asmlinkage void __sched notrace preempt_schedule(void) } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -2541,20 +2799,19 @@ EXPORT_SYMBOL(preempt_schedule); */ asmlinkage void __sched preempt_schedule_irq(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + BUG_ON(preempt_count() || !irqs_disabled()); prev_state = exception_enter(); do { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); local_irq_enable(); __schedule(); local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2566,8 +2823,6 @@ asmlinkage void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -#endif /* CONFIG_PREEMPT */ - int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { @@ -2575,393 +2830,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ - __wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ - __wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - int wake_flags = WF_SYNC; - - if (unlikely(!q)) - return; - - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - __wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - __add_wait_queue_tail_exclusive(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = action(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ - wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Return: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Return: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(completion_done); - static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -3022,11 +2890,11 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; + int oldprio, on_rq, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; - BUG_ON(prio < 0 || prio > MAX_PRIO); + BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); @@ -3049,6 +2917,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); + p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -3058,23 +2927,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->put_prev_task(rq, p); - if (rt_prio(prio)) + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || (p->pi_top_task && + dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + p->dl.dl_boosted = 1; + p->dl.dl_throttled = 0; + enqueue_flag = ENQUEUE_REPLENISH; + } else + p->dl.dl_boosted = 0; + p->sched_class = &dl_sched_class; + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (oldprio < prio) + enqueue_flag = ENQUEUE_HEAD; p->sched_class = &rt_sched_class; - else + } else { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; p->sched_class = &fair_sched_class; + } p->prio = prio; if (running) p->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: __task_rq_unlock(rq); } #endif + void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -3092,9 +2987,9 @@ void set_user_nice(struct task_struct *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -3249,22 +3144,95 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + init_dl_task_timer(dl_se); + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) { + int policy = attr->sched_policy; + + if (policy == -1) /* setparam */ + policy = p->policy; + p->policy = policy; - p->rt_priority = prio; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + + if (dl_prio(p->prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; + set_load_weight(p); } +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution (1us); we + * check sched_runtime only since it is always the smaller one. + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ + return attr && attr->sched_deadline != 0 && + (attr->sched_period == 0 || + (s64)(attr->sched_period - attr->sched_deadline) >= 0) && + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && + attr->sched_runtime >= (2 << (DL_SCALE - 1)); +} + /* * check the target process has a UID that matches the current process's */ @@ -3281,10 +3249,12 @@ static bool check_same_owner(struct task_struct *p) return match; } -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; + int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -3298,31 +3268,40 @@ recheck: reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_FIFO && policy != SCHED_RR && + if (policy != SCHED_DEADLINE && + policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && policy != SCHED_IDLE) return -EINVAL; } + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + return -EINVAL; + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (user && !capable(CAP_SYS_NICE)) { + if (fair_policy(policy)) { + if (attr->sched_nice < TASK_NICE(p) && + !can_nice(p, attr->sched_nice)) + return -EPERM; + } + if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); @@ -3332,8 +3311,8 @@ recheck: return -EPERM; /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) return -EPERM; } @@ -3381,14 +3360,21 @@ recheck: /* * If not changing anything there's no need to proceed further: */ - if (unlikely(policy == p->policy && (!rt_policy(policy) || - param->sched_priority == p->rt_priority))) { + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy)) + goto change; + task_rq_unlock(rq, p, &flags); return 0; } +change: -#ifdef CONFIG_RT_GROUP_SCHED if (user) { +#ifdef CONFIG_RT_GROUP_SCHED /* * Do not allow realtime tasks into groups that have no runtime * assigned. @@ -3399,8 +3385,24 @@ recheck: task_rq_unlock(rq, p, &flags); return -EPERM; } - } #endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, &p->cpus_allowed) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3408,6 +3410,17 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3419,7 +3432,7 @@ recheck: oldprio = p->prio; prev_class = p->sched_class; - __setscheduler(rq, p, policy, param->sched_priority); + __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); @@ -3434,6 +3447,26 @@ recheck: return 0; } +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + /* + * Fixup the legacy SCHED_RESET_ON_FORK hack + */ + if (policy & SCHED_RESET_ON_FORK) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check); +} /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. @@ -3447,10 +3480,16 @@ recheck: int sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, true); + return _sched_setscheduler(p, policy, param, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -3467,7 +3506,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, false); + return _sched_setscheduler(p, policy, param, false); } static int @@ -3492,6 +3531,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return retval; } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, + struct sched_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = SCHED_ATTR_SIZE_VER0; + + if (size < SCHED_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * XXX: do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, -20, 19); + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -3523,6 +3635,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) } /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + */ +SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +{ + struct sched_attr attr; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0) + return -EINVAL; + + if (sched_copy_attr(uattr, &attr)) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setattr(p, &attr); + rcu_read_unlock(); + + return retval; +} + +/** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. * @@ -3577,6 +3716,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; + if (task_has_dl_policy(p)) { + retval = -EINVAL; + goto out_unlock; + } lp.sched_priority = p->rt_priority; rcu_read_unlock(); @@ -3592,19 +3735,107 @@ out_unlock: return retval; } +static int sched_read_attr(struct sched_attr __user *uattr, + struct sched_attr *attr, + unsigned int usize) +{ + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, usize)) + return -EFAULT; + + /* + * If we're handed a smaller struct than we know of, + * ensure all the unknown bits are 0 - i.e. old + * user-space does not get uncomplete information. + */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + goto err_size; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, usize); + if (ret) + return -EFAULT; + +out: + return ret; + +err_size: + ret = -E2BIG; + goto out; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + */ +SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + if (task_has_dl_policy(p)) + __getparam_dl(p, &attr); + else if (task_has_rt_policy(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = TASK_NICE(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); return -ESRCH; } @@ -3638,8 +3869,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_unlock; + cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p)) { + const struct cpumask *span = task_rq(p)->rd->span; + + if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + retval = -EBUSY; + goto out_unlock; + } + } +#endif again: retval = set_cpus_allowed_ptr(p, new_mask); @@ -3661,7 +3910,6 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -3706,7 +3954,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -3719,12 +3966,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -3794,16 +4040,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static inline int should_resched(void) -{ - return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} - static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -3924,7 +4165,7 @@ again: } double_rq_lock(rq, p_rq); - while (task_rq(p) != p_rq) { + if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); goto again; } @@ -4013,6 +4254,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -4039,6 +4281,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_RR: ret = 1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -4186,7 +4429,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); - __sched_fork(idle); + __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -4212,7 +4455,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = 0; + init_idle_preempt_count(idle, cpu); /* * The idle tasks have their own, simple scheduling class: @@ -4346,6 +4589,54 @@ fail: return ret; } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ + struct migration_arg arg = { p, target_cpu }; + int curr_cpu = task_cpu(p); + + if (curr_cpu == target_cpu) + return 0; + + if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + return -EINVAL; + + /* TODO: This is not properly updating schedstats */ + + trace_sched_move_numa(p, curr_cpu, target_cpu); + return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ + struct rq *rq; + unsigned long flags; + bool on_rq, running; + + rq = task_rq_lock(p, &flags); + on_rq = p->on_rq; + running = task_current(rq, p); + + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->numa_preferred_nid = nid; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &flags); +} +#endif + /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -4738,13 +5029,31 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned long flags; + long cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); + set_cpu_active(cpu, false); + + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + struct dl_bw *dl_b = dl_bw_of(cpu); + bool overflow; + int cpus; + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (overflow) + return notifier_from_errno(-EBUSY); + } return NOTIFY_OK; - default: - return NOTIFY_DONE; } + + return NOTIFY_DONE; } static int __init migration_init(void) @@ -4963,6 +5272,8 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); + free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -4985,7 +5296,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rt yet then + * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -5014,8 +5325,14 @@ static int init_rootdomain(struct root_domain *rd) goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; + + init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; @@ -5023,6 +5340,8 @@ static int init_rootdomain(struct root_domain *rd) free_rto_mask: free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); free_online: free_cpumask_var(rd->online); free_span: @@ -5119,10 +5438,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; + struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5130,11 +5453,19 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); + busy_sd = sd->parent; /* sd_busy */ } + rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + + sd = highest_flag_domain(cpu, SD_ASYM_PACKING); + rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); } /* @@ -5325,6 +5656,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgp->power_orig = sg->sgp->power; /* * Make sure the first group of this domain contains the @@ -5654,6 +5986,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE | 0*SD_PREFER_SIBLING + | 1*SD_NUMA | sd_local_flags(level) , .last_balance = jiffies, @@ -6335,14 +6668,17 @@ void __init sched_init_smp(void) sched_init_numa(); - get_online_cpus(); + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -6357,6 +6693,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); + init_sched_dl_class(); } #else void __init sched_init_smp(void) @@ -6426,13 +6763,15 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ } + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, + global_rt_period(), global_rt_runtime()); + #ifdef CONFIG_SMP init_defrootdomain(); #endif - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6456,6 +6795,7 @@ void __init sched_init(void) rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); + init_dl_rq(&rq->dl, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6505,6 +6845,7 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -6526,10 +6867,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ @@ -6603,13 +6940,16 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { const struct sched_class *prev_class = p->sched_class; + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + }; int old_prio = p->prio; int on_rq; on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); + __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); resched_task(rq->curr); @@ -6639,7 +6979,7 @@ void normalize_rt_tasks(void) p->se.statistics.block_start = 0; #endif - if (!rt_task(p)) { + if (!dl_task(p) && !rt_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: @@ -6834,16 +7174,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - #ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. @@ -7017,24 +7347,13 @@ static long sched_group_rt_period(struct task_group *tg) do_div(rt_period_us, NSEC_PER_USEC); return rt_period_us; } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period; int ret = 0; - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -7057,17 +7376,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. - */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; + int i, ret = 0; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -7079,36 +7388,88 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; + return ret; } #endif /* CONFIG_RT_GROUP_SCHED */ -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int sched_dl_global_constraints(void) { - int ret; - static DEFINE_MUTEX(mutex); + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + int cpu, ret = 0; - mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* make sure that internally we keep jiffies */ - /* also, writing zero resets timeslice to default */ - if (!ret && write) { - sched_rr_timeslice = sched_rr_timeslice <= 0 ? - RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock(&dl_b->lock); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock(&dl_b->lock); + + if (ret) + break; } - mutex_unlock(&mutex); + return ret; } +static void sched_dl_do_global(void) +{ + u64 new_bw = -1; + int cpu; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock(&dl_b->lock); + dl_b->bw = new_bw; + raw_spin_unlock(&dl_b->lock); + } +} + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; int old_period, old_runtime; static DEFINE_MUTEX(mutex); + int ret; mutex_lock(&mutex); old_period = sysctl_sched_rt_period; @@ -7117,21 +7478,50 @@ int sched_rt_handler(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } + if (ret) + goto undo; + + ret = sched_dl_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; } mutex_unlock(&mutex); return ret; } +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -7277,7 +7667,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + /* + * If we need to toggle cfs_bandwidth_used, off->on must occur + * before making related changes, and on->off must occur afterwards + */ + if (runtime_enabled && !runtime_was_enabled) + cfs_bandwidth_usage_inc(); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -7303,6 +7698,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } + if (runtime_was_enabled && !runtime_enabled) + cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); @@ -7457,15 +7854,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v) { - struct task_group *tg = css_tg(css); + struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); return 0; } @@ -7519,7 +7915,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_map = cpu_stats_show, + .seq_show = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722ff0299..622e0818f905 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -163,10 +163,9 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; @@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct seq_file *sf, void *v) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(sf)); int cpu; s64 val = 0; @@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, val += kcpustat->cpustat[CPUTIME_NICE]; } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; for_each_online_cpu(cpu) { @@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); return 0; } @@ -220,11 +218,11 @@ static struct cftype files[] = { }, { .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, + .seq_show = cpuacct_percpu_seq_show, }, { .name = "stat", - .read_map = cpuacct_stats_show, + .seq_show = cpuacct_stats_show, }, { } /* terminate */ }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..045fc74e3f09 --- /dev/null +++ b/kernel/sched/cpudeadline.c @@ -0,0 +1,216 @@ +/* + * kernel/sched/cpudl.c + * + * Global CPU deadline management + * + * Author: Juri Lelli <j.lelli@sssup.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gfp.h> +#include <linux/kernel.h> +#include "cpudeadline.h" + +static inline int parent(int i) +{ + return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ + return (i << 1) + 1; +} + +static inline int right_child(int i) +{ + return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static void cpudl_exchange(struct cpudl *cp, int a, int b) +{ + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a], cp->elements[b]); + swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; + + /* adapted from lib/prio_heap.c */ + while(1) { + l = left_child(idx); + r = right_child(idx); + largest = idx; + + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) + largest = l; + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) + largest = r; + if (largest == idx) + break; + + /* Push idx down the heap one level and bump one up */ + cpudl_exchange(cp, largest, idx); + idx = largest; + } +} + +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ + WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ + return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask) +{ + int best_cpu = -1; + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && cpumask_and(later_mask, cp->free_cpus, + &p->cpus_allowed) && cpumask_and(later_mask, + later_mask, cpu_active_mask)) { + best_cpu = cpumask_any(later_mask); + goto out; + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + best_cpu = cpudl_maximum(cp); + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); + } + +out: + WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + + return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ + int old_idx, new_cpu; + unsigned long flags; + + WARN_ON(cpu > num_present_cpus()); + + raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->cpu_to_idx[cpu]; + if (!is_valid) { + /* remove item */ + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + goto out; + } + new_cpu = cp->elements[cp->size - 1].cpu; + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; + cp->elements[old_idx].cpu = new_cpu; + cp->size--; + cp->cpu_to_idx[new_cpu] = old_idx; + cp->cpu_to_idx[cpu] = IDX_INVALID; + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } + cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + + goto out; + } + + if (old_idx == IDX_INVALID) { + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->cpu_to_idx[cpu] = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); + cpumask_clear_cpu(cpu, cp->free_cpus); + } else { + cpudl_change_key(cp, old_idx, dl); + } + +out: + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + raw_spin_lock_init(&cp->lock); + cp->size = 0; + for (i = 0; i < NR_CPUS; i++) + cp->cpu_to_idx[i] = IDX_INVALID; + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + return -ENOMEM; + cpumask_setall(cp->free_cpus); + + return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ + /* + * nothing to do for the moment + */ +} diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include <linux/sched.h> + +#define IDX_INVALID -1 + +struct array_item { + u64 dl; + int cpu; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + int cpu_to_idx[NR_CPUS]; + struct array_item elements[NR_CPUS]; + cpumask_var_t free_cpus; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..0de248202879 --- /dev/null +++ b/kernel/sched/deadline.c @@ -0,0 +1,1640 @@ +/* + * Deadline Scheduling Class (SCHED_DEADLINE) + * + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). + * + * Tasks that periodically executes their instances for less than their + * runtime won't miss any of their deadlines. + * Tasks that are not periodic or sporadic or that tries to execute more + * than their reserved bandwidth will be slowed down (and may potentially + * miss some of their deadlines), and won't affect any other task. + * + * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, + * Juri Lelli <juri.lelli@gmail.com>, + * Michael Trimarchi <michael@amarulasolutions.com>, + * Fabio Checconi <fchecconi@gmail.com> + */ +#include "sched.h" + +#include <linux/slab.h> + +struct dl_bandwidth def_dl_bandwidth; + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ + return container_of(dl_se, struct task_struct, dl); +} + +static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) +{ + return container_of(dl_rq, struct rq, dl); +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + + return &rq->dl; +} + +static inline int on_dl_rq(struct sched_dl_entity *dl_se) +{ + return !RB_EMPTY_NODE(&dl_se->rb_node); +} + +static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +{ + struct sched_dl_entity *dl_se = &p->dl; + + return dl_rq->rb_leftmost == &dl_se->rb_node; +} + +void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) +{ + raw_spin_lock_init(&dl_b->dl_runtime_lock); + dl_b->dl_period = period; + dl_b->dl_runtime = runtime; +} + +extern unsigned long to_ratio(u64 period, u64 runtime); + +void init_dl_bw(struct dl_bw *dl_b) +{ + raw_spin_lock_init(&dl_b->lock); + raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); + if (global_rt_runtime() == RUNTIME_INF) + dl_b->bw = -1; + else + dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); + raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); + dl_b->total_bw = 0; +} + +void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +{ + dl_rq->rb_root = RB_ROOT; + +#ifdef CONFIG_SMP + /* zero means no -deadline tasks */ + dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; + + dl_rq->dl_nr_migratory = 0; + dl_rq->overloaded = 0; + dl_rq->pushable_dl_tasks_root = RB_ROOT; +#else + init_dl_bw(&dl_rq->dl_bw); +#endif +} + +#ifdef CONFIG_SMP + +static inline int dl_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->dlo_count); +} + +static inline void dl_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); + /* + * Must be visible before the overload count is + * set (as in sched_rt.c). + * + * Matched by the barrier in pull_dl_task(). + */ + smp_wmb(); + atomic_inc(&rq->rd->dlo_count); +} + +static inline void dl_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + atomic_dec(&rq->rd->dlo_count); + cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); +} + +static void update_dl_migration(struct dl_rq *dl_rq) +{ + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (!dl_rq->overloaded) { + dl_set_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 1; + } + } else if (dl_rq->overloaded) { + dl_clear_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 0; + } +} + +static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + dl_rq = &rq_of_dl_rq(dl_rq)->dl; + + dl_rq->dl_nr_total++; + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory++; + + update_dl_migration(dl_rq); +} + +static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + dl_rq = &rq_of_dl_rq(dl_rq)->dl; + + dl_rq->dl_nr_total--; + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory--; + + update_dl_migration(dl_rq); +} + +/* + * The list of pushable -deadline task is not a plist, like in + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. + */ +static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node *parent = NULL; + struct task_struct *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct task_struct, + pushable_dl_tasks); + if (dl_entity_preempt(&p->dl, &entry->dl)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + + rb_link_node(&p->pushable_dl_tasks, parent, link); + rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +} + +static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + + if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) + return; + + if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + struct rb_node *next_node; + + next_node = rb_next(&p->pushable_dl_tasks); + dl_rq->pushable_dl_tasks_leftmost = next_node; + } + + rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +} + +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); +} + +static int push_dl_task(struct rq *rq); + +#else + +static inline +void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline +void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +#endif /* CONFIG_SMP */ + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags); + +/* + * We are being explicitly informed that a new instance is starting, + * and this means that: + * - the absolute deadline of the entity has to be placed at + * current time + relative deadline; + * - the runtime of the entity has to be set to the maximum value. + * + * The capability of specifying such event is useful whenever a -deadline + * entity wants to (try to!) synchronize its behaviour with the scheduler's + * one, and to (try to!) reconcile itself with its own scheduling + * parameters. + */ +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + + /* + * We use the regular wall clock time to set deadlines in the + * future; in fact, we must consider execution overheads (time + * spent on hardirq context, etc.). + */ + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + dl_se->dl_new = 0; +} + +/* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus + * exhausting its runtime. + * + * Here we are interested in making runtime overrun possible, but we do + * not want a entity which is misbehaving to affect the scheduling of all + * other entities. + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) + * is used, in order to confine each entity within its own bandwidth. + * + * This function deals exactly with that, and ensures that when the runtime + * of a entity is replenished, its deadline is also postponed. That ensures + * the overrunning entity can't interfere with other entity in the system and + * can't make them miss their deadlines. Reasons why this kind of overruns + * could happen are, typically, a entity voluntarily trying to overcome its + * runtime, or it just underestimated it during sched_setscheduler_ex(). + */ +static void replenish_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + BUG_ON(pi_se->dl_runtime <= 0); + + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. + */ + if (dl_se->dl_deadline == 0) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } + + /* + * We keep moving the deadline away until we get some + * available runtime for the entity. This ensures correct + * handling of situations where the runtime overrun is + * arbitrary large. + */ + while (dl_se->runtime <= 0) { + dl_se->deadline += pi_se->dl_period; + dl_se->runtime += pi_se->dl_runtime; + } + + /* + * At this point, the deadline really should be "in + * the future" with respect to rq->clock. If it's + * not, we are, for some reason, lagging too much! + * Anyway, after having warn userspace abut that, + * we still try to keep the things running by + * resetting the deadline and the budget of the + * entity. + */ + if (dl_time_before(dl_se->deadline, rq_clock(rq))) { + static bool lag_once = false; + + if (!lag_once) { + lag_once = true; + printk_sched("sched: DL replenish lagged to much\n"); + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * Here we check if --at time t-- an entity (which is probably being + * [re]activated or, in general, enqueued) can use its remaining runtime + * and its current deadline _without_ exceeding the bandwidth it is + * assigned (function returns true if it can't). We are in fact applying + * one of the CBS rules: when a task wakes up, if the residual runtime + * over residual deadline fits within the allocated bandwidth, then we + * can keep the current (absolute) deadline and residual budget without + * disrupting the schedulability of the system. Otherwise, we should + * refill the runtime and set the deadline a period in the future, + * because keeping the current (absolute) deadline of the task would + * result in breaking guarantees promised to other tasks. + * + * This function returns true if: + * + * runtime / (deadline - t) > dl_runtime / dl_period , + * + * IOW we can't recycle current parameters. + * + * Notice that the bandwidth check is done against the period. For + * task with deadline equal to period this is the same of using + * dl_deadline instead of dl_period in the equation above. + */ +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, u64 t) +{ + u64 left, right; + + /* + * left and right are the two sides of the equation above, + * after a bit of shuffling to use multiplications instead + * of divisions. + * + * Note that none of the time values involved in the two + * multiplications are absolute: dl_deadline and dl_runtime + * are the relative deadline and the maximum runtime of each + * instance, runtime is the runtime left for the last instance + * and (deadline - t), since t is rq->clock, is the time left + * to the (absolute) deadline. Even if overflowing the u64 type + * is very unlikely to occur in both cases, here we scale down + * as we want to avoid that risk at all. Scaling down by 10 + * means that we reduce granularity to 1us. We are fine with it, + * since this is only a true/false check and, anyway, thinking + * of anything below microseconds resolution is actually fiction + * (but still we want to give the user that illusion >;). + */ + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + right = ((dl_se->deadline - t) >> DL_SCALE) * + (pi_se->dl_runtime >> DL_SCALE); + + return dl_time_before(right, left); +} + +/* + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. + * + * The policy here is that we update the deadline of the entity only if: + * - the current deadline is in the past, + * - using the remaining runtime with the current deadline would make + * the entity exceed its bandwidth. + */ +static void update_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * The arrival of a new instance needs special treatment, i.e., + * the actual scheduling parameters have to be "renewed". + */ + if (dl_se->dl_new) { + setup_new_dl_entity(dl_se, pi_se); + return; + } + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) || + dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * If the entity depleted all its runtime, and if we want it to sleep + * while waiting for some new execution time to become available, we + * set the bandwidth enforcement timer to the replenishment instant + * and try to activate it. + * + * Notice that it is important for the caller to know if the timer + * actually started or not (i.e., the replenishment instant is in + * the future or in the past). + */ +static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + ktime_t now, act; + ktime_t soft, hard; + unsigned long range; + s64 delta; + + if (boosted) + return 0; + /* + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. + */ + act = ns_to_ktime(dl_se->deadline); + now = hrtimer_cb_get_time(&dl_se->dl_timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); + + /* + * If the expiry time already passed, e.g., because the value + * chosen as the deadline is too small, don't even try to + * start the timer in the past! + */ + if (ktime_us_delta(act, now) < 0) + return 0; + + hrtimer_set_expires(&dl_se->dl_timer, act); + + soft = hrtimer_get_softexpires(&dl_se->dl_timer); + hard = hrtimer_get_expires(&dl_se->dl_timer); + range = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&dl_se->dl_timer, soft, + range, HRTIMER_MODE_ABS, 0); + + return hrtimer_active(&dl_se->dl_timer); +} + +/* + * This is the bandwidth enforcement timer callback. If here, we know + * a task is not on its dl_rq, since the fact that the timer was running + * means the task is throttled and needs a runtime replenishment. + * + * However, what we actually do depends on the fact the task is active, + * (it is on its rq) or has been removed from there by a call to + * dequeue_task_dl(). In the former case we must issue the runtime + * replenishment and add the task back to the dl_rq; in the latter, we just + * do nothing but clearing dl_throttled, so that runtime and deadline + * updating (and the queueing back to dl_rq) will be done by the + * next call to enqueue_task_dl(). + */ +static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + dl_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + raw_spin_lock(&rq->lock); + + /* + * We need to take care of a possible races here. In fact, the + * task might have changed its scheduling policy to something + * different from SCHED_DEADLINE or changed its reservation + * parameters (through sched_setscheduler()). + */ + if (!dl_task(p) || dl_se->dl_new) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + dl_se->dl_throttled = 0; + if (p->on_rq) { + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_task(rq->curr); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif + } +unlock: + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +void init_dl_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->dl_timer; + + if (hrtimer_active(timer)) { + hrtimer_try_to_cancel(timer); + return; + } + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = dl_task_timer; +} + +static +int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +{ + int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); + int rorun = dl_se->runtime <= 0; + + if (!rorun && !dmiss) + return 0; + + /* + * If we are beyond our current deadline and we are still + * executing, then we have already used some of the runtime of + * the next instance. Thus, if we do not account that, we are + * stealing bandwidth from the system at each deadline miss! + */ + if (dmiss) { + dl_se->runtime = rorun ? dl_se->runtime : 0; + dl_se->runtime -= rq_clock(rq) - dl_se->deadline; + } + + return 1; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + u64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, delta_exec); + + sched_rt_avg_update(rq, delta_exec); + + dl_se->runtime -= delta_exec; + if (dl_runtime_exceeded(rq, dl_se)) { + __dequeue_task_dl(rq, curr, 0); + if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) + dl_se->dl_throttled = 1; + else + enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + + if (!is_leftmost(curr, &rq->dl)) + resched_task(curr); + } + + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks + * would be able to exceed the shared quota. + * + * Account to the root rt group for now. + * + * The solution we're working towards is having the RT groups scheduled + * using deadline servers -- however there's a few nasties to figure + * out before that can happen. + */ + if (rt_bandwidth_enabled()) { + struct rt_rq *rt_rq = &rq->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + /* + * We'll let actual RT tasks worry about the overflow here, we + * have our own CBS to keep us inline -- see above. + */ + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } +} + +#ifdef CONFIG_SMP + +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); + +static inline u64 next_deadline(struct rq *rq) +{ + struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); + + if (next && dl_prio(next->prio)) + return next->dl.deadline; + else + return 0; +} + +static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + if (dl_rq->earliest_dl.curr == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + /* + * If the dl_rq had no -deadline tasks, or if the new task + * has shorter deadline than the current one on dl_rq, we + * know that the previous earliest becomes our next earliest, + * as the new task becomes the earliest itself. + */ + dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; + dl_rq->earliest_dl.curr = deadline; + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + } else if (dl_rq->earliest_dl.next == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.next)) { + /* + * On the other hand, if the new -deadline task has a + * a later deadline than the earliest one on dl_rq, but + * it is earlier than the next (if any), we must + * recompute the next-earliest. + */ + dl_rq->earliest_dl.next = next_deadline(rq); + } +} + +static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * Since we may have removed our earliest (and/or next earliest) + * task we must recompute them. + */ + if (!dl_rq->dl_nr_running) { + dl_rq->earliest_dl.curr = 0; + dl_rq->earliest_dl.next = 0; + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + } else { + struct rb_node *leftmost = dl_rq->rb_leftmost; + struct sched_dl_entity *entry; + + entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); + dl_rq->earliest_dl.curr = entry->deadline; + dl_rq->earliest_dl.next = next_deadline(rq); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + } +} + +#else + +static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} +static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} + +#endif /* CONFIG_SMP */ + +static inline +void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + u64 deadline = dl_se->deadline; + + WARN_ON(!dl_prio(prio)); + dl_rq->dl_nr_running++; + + inc_dl_deadline(dl_rq, deadline); + inc_dl_migration(dl_se, dl_rq); +} + +static inline +void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + + WARN_ON(!dl_prio(prio)); + WARN_ON(!dl_rq->dl_nr_running); + dl_rq->dl_nr_running--; + + dec_dl_deadline(dl_rq, dl_se->deadline); + dec_dl_migration(dl_se, dl_rq); +} + +static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node *parent = NULL; + struct sched_dl_entity *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_dl_entity, rb_node); + if (dl_time_before(dl_se->deadline, entry->deadline)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->rb_leftmost = &dl_se->rb_node; + + rb_link_node(&dl_se->rb_node, parent, link); + rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + + inc_dl_tasks(dl_se, dl_rq); +} + +static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + if (RB_EMPTY_NODE(&dl_se->rb_node)) + return; + + if (dl_rq->rb_leftmost == &dl_se->rb_node) { + struct rb_node *next_node; + + next_node = rb_next(&dl_se->rb_node); + dl_rq->rb_leftmost = next_node; + } + + rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + RB_CLEAR_NODE(&dl_se->rb_node); + + dec_dl_tasks(dl_se, dl_rq); +} + +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, int flags) +{ + BUG_ON(on_dl_rq(dl_se)); + + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); + else + update_dl_entity(dl_se, pi_se); + + __enqueue_dl_entity(dl_se); +} + +static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + __dequeue_dl_entity(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + struct sched_dl_entity *pi_se = &p->dl; + + /* + * Use the scheduling parameters of the top pi-waiter + * task if we have one and its (relative) deadline is + * smaller than our one... OTW we keep our runtime and + * deadline. + */ + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + pi_se = &pi_task->dl; + + /* + * If p is throttled, we do nothing. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + */ + if (p->dl.dl_throttled) + return; + + enqueue_dl_entity(&p->dl, pi_se, flags); + + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); + + inc_nr_running(rq); +} + +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + dequeue_dl_entity(&p->dl); + dequeue_pushable_dl_task(rq, p); +} + +static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + __dequeue_task_dl(rq, p, flags); + + dec_nr_running(rq); +} + +/* + * Yield task semantic for -deadline tasks is: + * + * get off from the CPU until our next instance, with + * a new runtime. This is of little use now, since we + * don't have a bandwidth reclaiming mechanism. Anyway, + * bandwidth reclaiming is planned for the future, and + * yield_task_dl will indicate that some spare budget + * is available for other task instances to use it. + */ +static void yield_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * We make the task go to sleep until its current deadline by + * forcing its runtime to zero. This way, update_curr_dl() stops + * it and the bandwidth timer will wake it up and will give it + * new scheduling parameters (thanks to dl_new=1). + */ + if (p->dl.runtime > 0) { + rq->curr->dl.dl_new = 1; + p->dl.runtime = 0; + } + update_curr_dl(rq); +} + +#ifdef CONFIG_SMP + +static int find_later_rq(struct task_struct *task); + +static int +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If we are dealing with a -deadline task, we must + * decide where to wake it up. + * If it has a later deadline and the current task + * on this rq can't move (provided the waking task + * can!) we prefer to send it somewhere else. On the + * other hand, if it has a shorter deadline, we + * try to make it stay here, it might be important. + */ + if (unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + (p->nr_cpus_allowed > 1)) { + int target = find_later_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + +out: + return cpu; +} + +static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) +{ + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + return; + + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ + if (p->nr_cpus_allowed != 1 && + cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + return; + + resched_task(rq->curr); +} + +#endif /* CONFIG_SMP */ + +/* + * Only called when both the current and waking task are -deadline + * tasks. + */ +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags) +{ + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * In the unlikely case current and p have the same deadline + * let us try to decide what's the best thing to do... + */ + if ((p->dl.deadline == rq->curr->dl.deadline) && + !test_tsk_need_resched(rq->curr)) + check_preempt_equal_dl(rq, p); +#endif /* CONFIG_SMP */ +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ + s64 delta = p->dl.dl_runtime - p->dl.runtime; + + if (delta > 10000) + hrtick_start(rq, p->dl.runtime); +} +#endif + +static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, + struct dl_rq *dl_rq) +{ + struct rb_node *left = dl_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_dl_entity, rb_node); +} + +struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct sched_dl_entity *dl_se; + struct task_struct *p; + struct dl_rq *dl_rq; + + dl_rq = &rq->dl; + + if (unlikely(!dl_rq->dl_nr_running)) + return NULL; + + dl_se = pick_next_dl_entity(rq, dl_rq); + BUG_ON(!dl_se); + + p = dl_task_of(dl_se); + p->se.exec_start = rq_clock_task(rq); + + /* Running task will never be pushed. */ + dequeue_pushable_dl_task(rq, p); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); +#endif + +#ifdef CONFIG_SMP + rq->post_schedule = has_pushable_dl_tasks(rq); +#endif /* CONFIG_SMP */ + + return p; +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +{ + update_curr_dl(rq); + + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_dl(rq); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + start_hrtick_dl(rq, p); +#endif +} + +static void task_fork_dl(struct task_struct *p) +{ + /* + * SCHED_DEADLINE tasks cannot fork and this is achieved through + * sched_fork() + */ +} + +static void task_dead_dl(struct task_struct *p) +{ + struct hrtimer *timer = &p->dl.dl_timer; + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + /* + * Since we are TASK_DEAD we won't slip out of the domain! + */ + raw_spin_lock_irq(&dl_b->lock); + dl_b->total_bw -= p->dl.dl_bw; + raw_spin_unlock_irq(&dl_b->lock); + + hrtimer_cancel(timer); +} + +static void set_curr_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define DL_MAX_TRIES 3 + +static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + + return 0; +} + +/* Returns the second earliest -deadline task, NULL otherwise */ +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.rb_leftmost; + struct sched_dl_entity *dl_se; + struct task_struct *p = NULL; + +next_node: + next_node = rb_next(next_node); + if (next_node) { + dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); + p = dl_task_of(dl_se); + + if (pick_dl_task(rq, p, cpu)) + return p; + + goto next_node; + } + + return NULL; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); + +static int find_later_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + int this_cpu = smp_processor_id(); + int best_cpu, cpu = task_cpu(task); + + /* Make sure the mask is initialized first */ + if (unlikely(!later_mask)) + return -1; + + if (task->nr_cpus_allowed == 1) + return -1; + + best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, + task, later_mask); + if (best_cpu == -1) + return -1; + + /* + * If we are here, some target has been found, + * the most suitable of which is cached in best_cpu. + * This is, among the runqueues where the current tasks + * have later deadlines than the task's one, the rq + * with the latest possible one. + * + * Now we check how well this matches with task's + * affinity and system topology. + * + * The last cpu where the task run is our first + * guess, since it is most likely cache-hot there. + */ + if (cpumask_test_cpu(cpu, later_mask)) + return cpu; + /* + * Check if this_cpu is to be skipped (i.e., it is + * not in the mask) or not. + */ + if (!cpumask_test_cpu(this_cpu, later_mask)) + this_cpu = -1; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + + /* + * If possible, preempting this_cpu is + * cheaper than migrating. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + /* + * Last chance: if best_cpu is valid and is + * in the mask, that becomes our choice. + */ + if (best_cpu < nr_cpu_ids && + cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * At this point, all our guesses failed, we just return + * 'something', and let the caller sort the things out. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + + return -1; +} + +/* Locks the rq it finds */ +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *later_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < DL_MAX_TRIES; tries++) { + cpu = find_later_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + later_rq = cpu_rq(cpu); + + /* Retry if something changed. */ + if (double_lock_balance(rq, later_rq)) { + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(later_rq->cpu, + &task->cpus_allowed) || + task_running(rq, task) || !task->on_rq)) { + double_unlock_balance(rq, later_rq); + later_rq = NULL; + break; + } + } + + /* + * If the rq we found has no -deadline task, or + * its earliest one has a later deadline than our + * task, the rq is a good one. + */ + if (!later_rq->dl.dl_nr_running || + dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) + break; + + /* Otherwise we try again. */ + double_unlock_balance(rq, later_rq); + later_rq = NULL; + } + + return later_rq; +} + +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!p->on_rq); + BUG_ON(!dl_task(p)); + + return p; +} + +/* + * See if the non running -deadline tasks on this rq + * can be sent to some other CPU where they can preempt + * and start executing. + */ +static int push_dl_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *later_rq; + + if (!rq->dl.overloaded) + return 0; + + next_task = pick_next_pushable_dl_task(rq); + if (!next_task) + return 0; + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * If next_task preempts rq->curr, and rq->curr + * can move away, it makes sense to just reschedule + * without going further in pushing next_task. + */ + if (dl_task(rq->curr) && + dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + rq->curr->nr_cpus_allowed > 1) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* Will lock the rq it'll find */ + later_rq = find_lock_later_rq(next_task, rq); + if (!later_rq) { + struct task_struct *task; + + /* + * We must check all this again, since + * find_lock_later_rq releases rq->lock and it is + * then possible that next_task has migrated. + */ + task = pick_next_pushable_dl_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * The task is still there. We don't try + * again, some other cpu will pull it when ready. + */ + dequeue_pushable_dl_task(rq, next_task); + goto out; + } + + if (!task) + /* No more tasks */ + goto out; + + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, later_rq->cpu); + activate_task(later_rq, next_task, 0); + + resched_task(later_rq->curr); + + double_unlock_balance(rq, later_rq); + +out: + put_task_struct(next_task); + + return 1; +} + +static void push_dl_tasks(struct rq *rq) +{ + /* Terminates as it moves a -deadline task */ + while (push_dl_task(rq)) + ; +} + +static int pull_dl_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + u64 dmin = LONG_MAX; + + if (likely(!dl_overloaded(this_rq))) + return 0; + + /* + * Match the barrier from dl_set_overloaded; this guarantees that if we + * see overloaded we must also see the dlo_mask bit. + */ + smp_rmb(); + + for_each_cpu(cpu, this_rq->rd->dlo_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * It looks racy, abd it is! However, as in sched_rt.c, + * we are fine with this. + */ + if (this_rq->dl.dl_nr_running && + dl_time_before(this_rq->dl.earliest_dl.curr, + src_rq->dl.earliest_dl.next)) + continue; + + /* Might drop this_rq->lock */ + double_lock_balance(this_rq, src_rq); + + /* + * If there are no more pullable tasks on the + * rq, we're done with it. + */ + if (src_rq->dl.dl_nr_running <= 1) + goto skip; + + p = pick_next_earliest_dl_task(src_rq, this_cpu); + + /* + * We found a task to be pulled if: + * - it preempts our current (if there's one), + * - it will preempt the last one we pulled (if any). + */ + if (p && dl_time_before(p->dl.deadline, dmin) && + (!this_rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + this_rq->dl.earliest_dl.curr))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * Then we pull iff p has actually an earlier + * deadline than the current task of its runqueue. + */ + if (dl_time_before(p->dl.deadline, + src_rq->curr->dl.deadline)) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + + /* Is there any other task even earlier? */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull other tasks here */ + if (dl_task(prev)) + pull_dl_task(rq); +} + +static void post_schedule_dl(struct rq *rq) +{ + push_dl_tasks(rq); +} + +/* + * Since the task is not running and a reschedule is not going to happen + * anytime soon on its runqueue, we try pushing it away now. + */ +static void task_woken_dl(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_dl_tasks(rq) && + p->nr_cpus_allowed > 1 && + dl_task(rq->curr) && + (rq->curr->nr_cpus_allowed < 2 || + dl_entity_preempt(&rq->curr->dl, &p->dl))) { + push_dl_tasks(rq); + } +} + +static void set_cpus_allowed_dl(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq; + int weight; + + BUG_ON(!dl_task(p)); + + /* + * Update only if the task is actually running (i.e., + * it is on the rq AND it is not throttled). + */ + if (!on_dl_rq(&p->dl)) + return; + + weight = cpumask_weight(new_mask); + + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; + + rq = task_rq(p); + + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_dl_task(rq, p); + BUG_ON(!rq->dl.dl_nr_migratory); + rq->dl.dl_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_dl_task(rq, p); + rq->dl.dl_nr_migratory++; + } + + update_dl_migration(&rq->dl); +} + +/* Assumes rq->lock is held */ +static void rq_online_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_set_overload(rq); + + if (rq->dl.dl_nr_running > 0) + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); +} + +/* Assumes rq->lock is held */ +static void rq_offline_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_clear_overload(rq); + + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); +} + +void init_sched_dl_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), + GFP_KERNEL, cpu_to_node(i)); +} + +#endif /* CONFIG_SMP */ + +static void switched_from_dl(struct rq *rq, struct task_struct *p) +{ + if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) + hrtimer_try_to_cancel(&p->dl.dl_timer); + +#ifdef CONFIG_SMP + /* + * Since this might be the only -deadline task on the rq, + * this is the right place to try to pull some other one + * from an overloaded cpu, if any. + */ + if (!rq->dl.dl_nr_running) + pull_dl_task(rq); +#endif +} + +/* + * When switching to -deadline, we may overload the rq, then + * we try to push someone off, if possible. + */ +static void switched_to_dl(struct rq *rq, struct task_struct *p) +{ + int check_resched = 1; + + /* + * If p is throttled, don't consider the possibility + * of preempting rq->curr, the check will be done right + * after its runtime will get replenished. + */ + if (unlikely(p->dl.dl_throttled)) + return; + + if (p->on_rq || rq->curr != p) { +#ifdef CONFIG_SMP + if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + /* Only reschedule if pushing failed */ + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + } +} + +/* + * If the scheduling parameters of a -deadline task changed, + * a push or pull operation might be needed. + */ +static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) +{ + if (p->on_rq || rq->curr == p) { +#ifdef CONFIG_SMP + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + pull_dl_task(rq); + + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this + * runqueue. + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && + rq->curr == p) + resched_task(p); +#else + /* + * Again, we don't know if p has a earlier + * or later deadline, so let's blindly set a + * (maybe not needed) rescheduling point. + */ + resched_task(p); +#endif /* CONFIG_SMP */ + } else + switched_to_dl(rq, p); +} + +const struct sched_class dl_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_dl, + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + + .check_preempt_curr = check_preempt_curr_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .pre_schedule = pre_schedule_dl, + .post_schedule = post_schedule_dl, + .task_woken = task_woken_dl, +#endif + + .set_curr_task = set_curr_task_dl, + .task_tick = task_tick_dl, + .task_fork = task_fork_dl, + .task_dead = task_dead_dl, + + .prio_changed = prio_changed_dl, + .switched_from = switched_from_dl, + .switched_to = switched_to_dl, +}; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7c..dd52e7ffb10e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/utsname.h> +#include <linux/mempolicy.h> #include "sched.h" @@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d", task_node(p)); +#endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); #endif @@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->on_rq || task_cpu(p) != rq_cpu) + if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); @@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif +#ifdef CONFIG_CFS_BANDWIDTH + SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", + cfs_rq->tg->cfs_bandwidth.timer_active); + SEQ_printf(m, " .%-30s: %d\n", "throttled", + cfs_rq->throttled); + SEQ_printf(m, " .%-30s: %d\n", "throttle_count", + cfs_rq->throttle_count); +#endif #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); @@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m) cpu_clk = local_clock(); local_irq_restore(flags); - SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -359,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) PN(cpu_clk); P(jiffies); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - P(sched_clock_stable); + P(sched_clock_stable()); #endif #undef PN #undef P @@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void) __initcall(init_sched_debug_procfs); +#define __P(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) +#define P(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + +static void sched_show_numa(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_NUMA_BALANCING + struct mempolicy *pol; + int node, i; + + if (p->mm) + P(mm->numa_scan_seq); + + task_lock(p); + pol = p->mempolicy; + if (pol && !(pol->flags & MPOL_F_MORON)) + pol = NULL; + mpol_get(pol); + task_unlock(p); + + SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); + + for_each_online_node(node) { + for (i = 0; i < 2; i++) { + unsigned long nr_faults = -1; + int cpu_current, home_node; + + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; + + cpu_current = !i ? (task_node(p) == node) : + (pol && node_isset(node, pol->v.nodes)); + + home_node = (p->numa_preferred_nid == node); + + SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + i, node, cpu_current, home_node, nr_faults); + } + } + + mpol_put(pol); +#endif +} + void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; @@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } + + sched_show_numa(p, m); } void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc61..867b0a4b0893 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -178,59 +178,61 @@ void sched_init_granularity(void) update_sysctl(); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - +#define WMULT_CONST (~0U) #define WMULT_SHIFT 32 -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +static void __update_inv_weight(struct load_weight *lw) +{ + unsigned long w; + + if (likely(lw->inv_weight)) + return; + + w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; +} /* - * delta *= weight / lw + * delta_exec * weight / lw.weight + * OR + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT + * + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * we're guaranteed shift stays positive because inv_weight is guaranteed to + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. + * + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus + * weight/lw.weight <= 1, and therefore our shift will also be positive. */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; + u64 fact = scale_load_down(weight); + int shift = WMULT_SHIFT; - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); + __update_inv_weight(lw); - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; + if (unlikely(fact >> 32)) { + while (fact >> 32) { + fact >>= 1; + shift--; + } } - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + /* hint to use a 32x32->64 mul */ + fact = (u64)(u32)fact * lw->inv_weight; + + while (fact >> 32) { + fact >>= 1; + shift--; + } - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); + return mul_u64_u32_shr(delta_exec, fact, shift); } @@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, /* * delta /= w */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); return delta; } @@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&lw, se->load.weight); load = &lw; } - slice = calc_delta_mine(slice, se->load.weight, load); + slice = __calc_delta(slice, se->load.weight, load); } return slice; } @@ -681,6 +682,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static unsigned long task_h_load(struct task_struct *p); + static inline void __update_task_entity_contrib(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ @@ -701,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p) #endif /* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. + * Update the current task's runtime statistics. */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); -} - static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_clock_task(rq_of(cfs_rq)); - unsigned long delta_exec; + u64 delta_exec; if (unlikely(!curr)) return; - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) return; - __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -818,11 +806,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * numa task sample period in ms + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size. */ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -830,41 +819,830 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -static void task_numa_placement(struct task_struct *p) +/* + * After skipping a page migration on a shared page, skip N more numa page + * migrations unconditionally. This reduces the number of NUMA migrations + * in shared memory workloads, and has the effect of pulling tasks towards + * where their memory lives, over pulling the memory towards the task. + */ +unsigned int sysctl_numa_balancing_migrate_deferred = 16; + +static unsigned int task_nr_scan_windows(struct task_struct *p) +{ + unsigned long rss = 0; + unsigned long nr_scan_pages; + + /* + * Calculations based on RSS as non-present and empty pages are skipped + * by the PTE scanner and NUMA hinting faults should be trapped based + * on resident pages + */ + nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + rss = get_mm_rss(p->mm); + if (!rss) + rss = nr_scan_pages; + + rss = round_up(rss, nr_scan_pages); + return rss / nr_scan_pages; +} + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) +{ + unsigned int scan, floor; + unsigned int windows = 1; + + if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + floor = 1000 / windows; + + scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); + return max_t(unsigned int, floor, scan); +} + +static unsigned int task_scan_max(struct task_struct *p) +{ + unsigned int smin = task_scan_min(p); + unsigned int smax; + + /* Watch for min being lower than max due to floor calculations */ + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + return max(smin, smax); +} + +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); +} + +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); +} + +struct numa_group { + atomic_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + struct list_head task_list; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long faults[0]; +}; + +pid_t task_numa_group_id(struct task_struct *p) +{ + return p->numa_group ? p->numa_group->gid : 0; +} + +static inline int task_faults_idx(int nid, int priv) +{ + return 2 * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ + if (!p->numa_faults) + return 0; + + return p->numa_faults[task_faults_idx(nid, 0)] + + p->numa_faults[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults(struct task_struct *p, int nid) +{ + if (!p->numa_group) + return 0; + + return p->numa_group->faults[task_faults_idx(nid, 0)] + + p->numa_group->faults[task_faults_idx(nid, 1)]; +} + +/* + * These return the fraction of accesses done by a particular task, or + * task group, on a particular numa node. The group weight is given a + * larger multiplier, in order to group tasks together that are almost + * evenly spread out between numa nodes. + */ +static inline unsigned long task_weight(struct task_struct *p, int nid) +{ + unsigned long total_faults; + + if (!p->numa_faults) + return 0; + + total_faults = p->total_numa_faults; + + if (!total_faults) + return 0; + + return 1000 * task_faults(p, nid) / total_faults; +} + +static inline unsigned long group_weight(struct task_struct *p, int nid) +{ + if (!p->numa_group || !p->numa_group->total_faults) + return 0; + + return 1000 * group_faults(p, nid) / p->numa_group->total_faults; +} + +static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long power_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); + +/* Cached statistics for all CPUs within a node */ +struct numa_stats { + unsigned long nr_running; + unsigned long load; + + /* Total compute capacity of CPUs on a node */ + unsigned long power; + + /* Approximate capacity in terms of runnable tasks on a node */ + unsigned long capacity; + int has_capacity; +}; + +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct numa_stats *ns, int nid) +{ + int cpu, cpus = 0; + + memset(ns, 0, sizeof(*ns)); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->nr_running += rq->nr_running; + ns->load += weighted_cpuload(cpu); + ns->power += power_of(cpu); + + cpus++; + } + + /* + * If we raced with hotplug and there are no CPUs left in our mask + * the @ns structure is NULL'ed and task_numa_compare() will + * not find this node attractive. + * + * We'll either bail at !has_capacity, or we'll detect a huge imbalance + * and bail there. + */ + if (!cpus) + return; + + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; + ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); + ns->has_capacity = (ns->nr_running < ns->capacity); +} + +struct task_numa_env { + struct task_struct *p; + + int src_cpu, src_nid; + int dst_cpu, dst_nid; + + struct numa_stats src_stats, dst_stats; + + int imbalance_pct; + + struct task_struct *best_task; + long best_imp; + int best_cpu; +}; + +static void task_numa_assign(struct task_numa_env *env, + struct task_struct *p, long imp) +{ + if (env->best_task) + put_task_struct(env->best_task); + if (p) + get_task_struct(p); + + env->best_task = p; + env->best_imp = imp; + env->best_cpu = env->dst_cpu; +} + +/* + * This checks if the overall compute and NUMA accesses of the system would + * be improved if the source tasks was migrated to the target dst_cpu taking + * into account that it might be best if task running on the dst_cpu should + * be exchanged with the source task + */ +static void task_numa_compare(struct task_numa_env *env, + long taskimp, long groupimp) +{ + struct rq *src_rq = cpu_rq(env->src_cpu); + struct rq *dst_rq = cpu_rq(env->dst_cpu); + struct task_struct *cur; + long dst_load, src_load; + long load; + long imp = (groupimp > 0) ? groupimp : taskimp; + + rcu_read_lock(); + cur = ACCESS_ONCE(dst_rq->curr); + if (cur->pid == 0) /* idle */ + cur = NULL; + + /* + * "imp" is the fault differential for the source task between the + * source and destination node. Calculate the total differential for + * the source task and potential destination task. The more negative + * the value is, the more rmeote accesses that would be expected to + * be incurred if the tasks were swapped. + */ + if (cur) { + /* Skip this swap candidate if cannot move to the source cpu */ + if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + goto unlock; + + /* + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ + if (cur->numa_group == env->p->numa_group) { + imp = taskimp + task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + /* + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. + */ + if (cur->numa_group) + imp -= imp/16; + } else { + /* + * Compare the group weights. If a task is all by + * itself (not part of a group), use the task weight + * instead. + */ + if (env->p->numa_group) + imp = groupimp; + else + imp = taskimp; + + if (cur->numa_group) + imp += group_weight(cur, env->src_nid) - + group_weight(cur, env->dst_nid); + else + imp += task_weight(cur, env->src_nid) - + task_weight(cur, env->dst_nid); + } + } + + if (imp < env->best_imp) + goto unlock; + + if (!cur) { + /* Is there capacity at our destination? */ + if (env->src_stats.has_capacity && + !env->dst_stats.has_capacity) + goto unlock; + + goto balance; + } + + /* Balance doesn't matter much if we're running a task per cpu */ + if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + goto assign; + + /* + * In the overloaded case, try and keep the load balanced. + */ +balance: + dst_load = env->dst_stats.load; + src_load = env->src_stats.load; + + /* XXX missing power terms */ + load = task_h_load(env->p); + dst_load += load; + src_load -= load; + + if (cur) { + load = task_h_load(cur); + dst_load -= load; + src_load += load; + } + + /* make src_load the smaller */ + if (dst_load < src_load) + swap(dst_load, src_load); + + if (src_load * env->imbalance_pct < dst_load * 100) + goto unlock; + +assign: + task_numa_assign(env, cur, imp); +unlock: + rcu_read_unlock(); +} + +static void task_numa_find_cpu(struct task_numa_env *env, + long taskimp, long groupimp) +{ + int cpu; + + for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { + /* Skip this CPU if the source task cannot migrate */ + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + continue; + + env->dst_cpu = cpu; + task_numa_compare(env, taskimp, groupimp); + } +} + +static int task_numa_migrate(struct task_struct *p) +{ + struct task_numa_env env = { + .p = p, + + .src_cpu = task_cpu(p), + .src_nid = task_node(p), + + .imbalance_pct = 112, + + .best_task = NULL, + .best_imp = 0, + .best_cpu = -1 + }; + struct sched_domain *sd; + unsigned long taskweight, groupweight; + int nid, ret; + long taskimp, groupimp; + + /* + * Pick the lowest SD_NUMA domain, as that would have the smallest + * imbalance and would be the first to start moving tasks about. + * + * And we want to avoid any moving of tasks about, as that would create + * random movement of tasks -- counter the numa conditions we're trying + * to satisfy here. + */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + if (sd) + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + rcu_read_unlock(); + + /* + * Cpusets can break the scheduler domain tree into smaller + * balance domains, some of which do not cross NUMA boundaries. + * Tasks that are "trapped" in such domains cannot be migrated + * elsewhere, so there is no point in (re)trying. + */ + if (unlikely(!sd)) { + p->numa_preferred_nid = task_node(p); + return -EINVAL; + } + + taskweight = task_weight(p, env.src_nid); + groupweight = group_weight(p, env.src_nid); + update_numa_stats(&env.src_stats, env.src_nid); + env.dst_nid = p->numa_preferred_nid; + taskimp = task_weight(p, env.dst_nid) - taskweight; + groupimp = group_weight(p, env.dst_nid) - groupweight; + update_numa_stats(&env.dst_stats, env.dst_nid); + + /* If the preferred nid has capacity, try to use it. */ + if (env.dst_stats.has_capacity) + task_numa_find_cpu(&env, taskimp, groupimp); + + /* No space available on the preferred nid. Look elsewhere. */ + if (env.best_cpu == -1) { + for_each_online_node(nid) { + if (nid == env.src_nid || nid == p->numa_preferred_nid) + continue; + + /* Only consider nodes where both task and groups benefit */ + taskimp = task_weight(p, nid) - taskweight; + groupimp = group_weight(p, nid) - groupweight; + if (taskimp < 0 && groupimp < 0) + continue; + + env.dst_nid = nid; + update_numa_stats(&env.dst_stats, env.dst_nid); + task_numa_find_cpu(&env, taskimp, groupimp); + } + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; + + sched_setnuma(p, env.dst_nid); + + /* + * Reset the scan period if the task is being rescheduled on an + * alternative node to recheck if the tasks is now properly placed. + */ + p->numa_scan_period = task_scan_min(p); + + if (env.best_task == NULL) { + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + return ret; + } + + ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + put_task_struct(env.best_task); + return ret; +} + +/* Attempt to migrate a task to a CPU on the preferred node. */ +static void numa_migrate_preferred(struct task_struct *p) +{ + /* This task has no NUMA fault statistics yet */ + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + return; + + /* Periodically retry migrating the task to the preferred node */ + p->numa_migrate_retry = jiffies + HZ; + + /* Success if task is already running on preferred CPU */ + if (task_node(p) == p->numa_preferred_nid) + return; + + /* Otherwise, try migrate to a CPU on the preferred node */ + task_numa_migrate(p); +} + +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, + unsigned long shared, unsigned long private) { - int seq; + unsigned int period_slot; + int ratio; + int diff; + + unsigned long remote = p->numa_faults_locality[0]; + unsigned long local = p->numa_faults_locality[1]; + + /* + * If there were no record hinting faults then either the task is + * completely idle or all activity is areas that are not of interest + * to automatic numa balancing. Scan slower + */ + if (local + shared == 0) { + p->numa_scan_period = min(p->numa_scan_period_max, + p->numa_scan_period << 1); + + p->mm->numa_next_scan = jiffies + + msecs_to_jiffies(p->numa_scan_period); - if (!p->mm) /* for example, ksmd faulting in a user's mm */ return; + } + + /* + * Prepare to scale scan period relative to the current period. + * == NUMA_PERIOD_THRESHOLD scan period stays the same + * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) + * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) + */ + period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); + ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); + if (ratio >= NUMA_PERIOD_THRESHOLD) { + int slot = ratio - NUMA_PERIOD_THRESHOLD; + if (!slot) + slot = 1; + diff = slot * period_slot; + } else { + diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + + /* + * Scale scan rate increases based on sharing. There is an + * inverse relationship between the degree of sharing and + * the adjustment made to the scanning period. Broadly + * speaking the intent is that there is little point + * scanning faster if shared accesses dominate as it may + * simply bounce migrations uselessly + */ + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + diff = (diff * ratio) / NUMA_PERIOD_SLOTS; + } + + p->numa_scan_period = clamp(p->numa_scan_period + diff, + task_scan_min(p), task_scan_max(p)); + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + +static void task_numa_placement(struct task_struct *p) +{ + int seq, nid, max_nid = -1, max_group_nid = -1; + unsigned long max_faults = 0, max_group_faults = 0; + unsigned long fault_types[2] = { 0, 0 }; + spinlock_t *group_lock = NULL; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; + p->numa_scan_period_max = task_scan_max(p); + + /* If the task is part of a group prevent parallel updates to group stats */ + if (p->numa_group) { + group_lock = &p->numa_group->lock; + spin_lock(group_lock); + } + + /* Find the node with the highest number of faults */ + for_each_online_node(nid) { + unsigned long faults = 0, group_faults = 0; + int priv, i; + + for (priv = 0; priv < 2; priv++) { + long diff; + + i = task_faults_idx(nid, priv); + diff = -p->numa_faults[i]; + + /* Decay existing window, copy faults since last scan */ + p->numa_faults[i] >>= 1; + p->numa_faults[i] += p->numa_faults_buffer[i]; + fault_types[priv] += p->numa_faults_buffer[i]; + p->numa_faults_buffer[i] = 0; + + faults += p->numa_faults[i]; + diff += p->numa_faults[i]; + p->total_numa_faults += diff; + if (p->numa_group) { + /* safe because we can only change our own group */ + p->numa_group->faults[i] += diff; + p->numa_group->total_faults += diff; + group_faults += p->numa_group->faults[i]; + } + } + + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + + if (group_faults > max_group_faults) { + max_group_faults = group_faults; + max_group_nid = nid; + } + } + + update_task_scan_period(p, fault_types[0], fault_types[1]); + + if (p->numa_group) { + /* + * If the preferred task and group nids are different, + * iterate over the nodes again to find the best place. + */ + if (max_nid != max_group_nid) { + unsigned long weight, max_weight = 0; + + for_each_online_node(nid) { + weight = task_weight(p, nid) + group_weight(p, nid); + if (weight > max_weight) { + max_weight = weight; + max_nid = nid; + } + } + } + + spin_unlock(group_lock); + } + + /* Preferred node as the node with the most faults */ + if (max_faults && max_nid != p->numa_preferred_nid) { + /* Update the preferred nid and migrate task if possible */ + sched_setnuma(p, max_nid); + numa_migrate_preferred(p); + } +} + +static inline int get_numa_group(struct numa_group *grp) +{ + return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (atomic_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void task_numa_group(struct task_struct *p, int cpupid, int flags, + int *priv) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!p->numa_group)) { + unsigned int size = sizeof(struct numa_group) + + 2*nr_node_ids*sizeof(unsigned long); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + atomic_set(&grp->refcount, 1); + spin_lock_init(&grp->lock); + INIT_LIST_HEAD(&grp->task_list); + grp->gid = p->pid; - /* FIXME: Scheduling placement policy hints go here */ + for (i = 0; i < 2*nr_node_ids; i++) + grp->faults[i] = p->numa_faults[i]; + + grp->total_faults = p->total_numa_faults; + + list_add(&p->numa_entry, &grp->task_list); + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } + + rcu_read_lock(); + tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto no_join; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto no_join; + + my_grp = p->numa_group; + if (grp == my_grp) + goto no_join; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto no_join; + + /* + * Tie-break on the grp address. + */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto no_join; + + /* Always join threads in the same process. */ + if (tsk->mm == current->mm) + join = true; + + /* Simple filter to avoid false positives due to PID collisions */ + if (flags & TNF_SHARED) + join = true; + + /* Update priv based on whether false sharing was detected */ + *priv = !join; + + if (join && !get_numa_group(grp)) + goto no_join; + + rcu_read_unlock(); + + if (!join) + return; + + double_lock(&my_grp->lock, &grp->lock); + + for (i = 0; i < 2*nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; + } + my_grp->total_faults -= p->total_numa_faults; + grp->total_faults += p->total_numa_faults; + + list_move(&p->numa_entry, &grp->task_list); + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); + return; + +no_join: + rcu_read_unlock(); + return; +} + +void task_numa_free(struct task_struct *p) +{ + struct numa_group *grp = p->numa_group; + int i; + void *numa_faults = p->numa_faults; + + if (grp) { + spin_lock(&grp->lock); + for (i = 0; i < 2*nr_node_ids; i++) + grp->faults[i] -= p->numa_faults[i]; + grp->total_faults -= p->total_numa_faults; + + list_del(&p->numa_entry); + grp->nr_tasks--; + spin_unlock(&grp->lock); + rcu_assign_pointer(p->numa_group, NULL); + put_numa_group(grp); + } + + p->numa_faults = NULL; + p->numa_faults_buffer = NULL; + kfree(numa_faults); } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int node, int pages, int flags) { struct task_struct *p = current; + bool migrated = flags & TNF_MIGRATED; + int priv; if (!numabalancing_enabled) return; - /* FIXME: Allocate task-specific structure for placement policy here */ + /* for example, ksmd faulting in a user's mm */ + if (!p->mm) + return; + + /* Do not worry about placement if exiting */ + if (p->state == TASK_DEAD) + return; + + /* Allocate buffer to track faults on a per-node basis */ + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + + /* numa_faults and numa_faults_buffer share the allocation */ + p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) + return; + + BUG_ON(p->numa_faults_buffer); + p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); + } /* - * If pages are properly placed (did not migrate) then scan slower. - * This is reset periodically in case of phase changes + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed */ - if (!migrated) - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(10)); + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv && !(flags & TNF_NO_GROUP)) + task_numa_group(p, last_cpupid, flags, &priv); + } task_numa_placement(p); + + /* + * Retry task to preferred node migration periodically, in case it + * case it previously failed, or the scheduler moved us. + */ + if (time_after(jiffies, p->numa_migrate_retry)) + numa_migrate_preferred(p); + + if (migrated) + p->numa_pages_migrated += pages; + + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -884,6 +1662,7 @@ void task_numa_work(struct callback_head *work) struct mm_struct *mm = p->mm; struct vm_area_struct *vma; unsigned long start, end; + unsigned long nr_pte_updates = 0; long pages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -900,35 +1679,9 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; - /* - * We do not care about task placement until a task runs on a node - * other than the first one used by the address space. This is - * largely because migrations are driven by what CPU the task - * is running on. If it's never scheduled on another node, it'll - * not migrate so why bother trapping the fault. - */ - if (mm->first_nid == NUMA_PTE_SCAN_INIT) - mm->first_nid = numa_node_id(); - if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { - /* Are we running on a new node yet? */ - if (numa_node_id() == mm->first_nid && - !sched_feat_numa(NUMA_FORCE)) - return; - - mm->first_nid = NUMA_PTE_SCAN_ACTIVE; - } - - /* - * Reset the scan period if enough time has gone by. Objective is that - * scanning will be reduced if pages are properly placed. As tasks - * can enter different phases this needs to be re-examined. Lacking - * proper tracking of reference behaviour, this blunt hammer is used. - */ - migrate = mm->numa_next_reset; - if (time_after(now, migrate)) { - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); - xchg(&mm->numa_next_reset, next_scan); + if (!mm->numa_next_scan) { + mm->numa_next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); } /* @@ -938,20 +1691,20 @@ void task_numa_work(struct callback_head *work) if (time_before(now, migrate)) return; - if (p->numa_scan_period == 0) - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + if (p->numa_scan_period == 0) { + p->numa_scan_period_max = task_scan_max(p); + p->numa_scan_period = task_scan_min(p); + } next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; /* - * Do not set pte_numa if the current running node is rate-limited. - * This loses statistics on the fault but if we are unwilling to - * migrate to this node, it is less likely we can do useful work + * Delay this task enough that another task of this mm will likely win + * the next time around. */ - if (migrate_ratelimited(numa_node_id())) - return; + p->node_stamp += 2 * TICK_NSEC; start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; @@ -967,18 +1720,39 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) continue; - /* Skip small VMAs. They are not likely to be of relevance */ - if (vma->vm_end - vma->vm_start < HPAGE_SIZE) + /* + * Shared library pages mapped by multiple processes are not + * migrated as it is expected they are cache replicated. Avoid + * hinting faults in read-only file-backed mappings or the vdso + * as migrating the pages will be of marginal benefit. + */ + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + continue; + + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) continue; do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - pages -= change_prot_numa(vma, start, end); + nr_pte_updates += change_prot_numa(vma, start, end); + + /* + * Scan sysctl_numa_balancing_scan_size but ensure that + * at least one PTE is updated so that unused virtual + * address space is quickly skipped. + */ + if (nr_pte_updates) + pages -= (end - start) >> PAGE_SHIFT; start = end; if (pages <= 0) @@ -988,10 +1762,10 @@ void task_numa_work(struct callback_head *work) out: /* - * It is possible to reach the end of the VMA list but the last few VMAs are - * not guaranteed to the vma_migratable. If they are not, we would find the - * !migratable VMA on the next scan but not reset the scanner to the start - * so check it now. + * It is possible to reach the end of the VMA list but the last few + * VMAs are not guaranteed to the vma_migratable. If they are not, we + * would find the !migratable VMA on the next scan but not reset the + * scanner to the start so check it now. */ if (vma) mm->numa_scan_offset = start; @@ -1025,8 +1799,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) if (now - curr->node_stamp > period) { if (!curr->node_stamp) - curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; - curr->node_stamp = now; + curr->numa_scan_period = task_scan_min(curr); + curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) { init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ @@ -1038,6 +1812,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } + +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -1047,8 +1829,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP - if (entity_is_task(se)) - list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + if (entity_is_task(se)) { + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } #endif cfs_rq->nr_running++; } @@ -1059,8 +1845,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); + } cfs_rq->nr_running--; } @@ -1378,7 +2166,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, long contrib; /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; @@ -2070,13 +2858,14 @@ static inline bool cfs_bandwidth_used(void) return static_key_false(&__cfs_bandwidth_used); } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_inc(void) +{ + static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void) { - /* only need to count groups transitioning between enabled/!enabled */ - if (enabled && !was_enabled) - static_key_slow_inc(&__cfs_bandwidth_used); - else if (!enabled && was_enabled) - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -2084,7 +2873,8 @@ static bool cfs_bandwidth_used(void) return true; } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {} #endif /* HAVE_JUMP_LABEL */ /* @@ -2213,8 +3003,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; @@ -2232,7 +3021,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2335,6 +3124,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); } @@ -2448,6 +3239,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (idle) goto out_unlock; + /* + * if we have relooped after returning idle once, we need to update our + * status as actually running, so that other cpus doing + * __start_cfs_bandwidth will stop trying to cancel us. + */ + cfs_b->timer_active = 1; + __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -2508,7 +3306,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -2584,10 +3388,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) u64 expires; /* confirm we're still not at a refresh boundary */ - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + raw_spin_lock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { + raw_spin_unlock(&cfs_b->lock); return; + } - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { runtime = cfs_b->runtime; cfs_b->runtime = 0; @@ -2708,11 +3514,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * (timer_active==0 becomes visible before the hrtimer call-back * terminates). In either case we ensure that it's re-programmed */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + while (unlikely(hrtimer_active(&cfs_b->period_timer)) && + hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { + /* bounce the lock to allow do_sched_cfs_period_timer to run */ raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - + cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ if (cfs_b->timer_active) @@ -2755,8 +3561,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) return rq_clock_task(rq_of(cfs_rq)); } -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -3166,8 +3971,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) } #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { return wl; } @@ -3292,12 +4096,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) + int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + do { unsigned long load, avg_load; int local_group; @@ -3420,11 +4228,10 @@ done: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; @@ -3466,7 +4273,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) } while (sd) { - int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -3475,10 +4281,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) continue; } - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -3904,9 +4707,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; +enum fbq_type { regular, remote, all }; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED 0x04 +#define LBF_SOME_PINNED 0x08 struct lb_env { struct sched_domain *sd; @@ -3929,6 +4735,8 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; + + enum fbq_type fbq_type; }; /* @@ -3975,6 +4783,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ + int src_nid, dst_nid; + + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + !(env->sd->flags & SD_NUMA)) { + return false; + } + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid) + return false; + + /* Always encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return true; + + /* If both task and group weight improve, this move is a winner. */ + if (task_weight(p, dst_nid) > task_weight(p, src_nid) && + group_weight(p, dst_nid) > group_weight(p, src_nid)) + return true; + + return false; +} + + +static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +{ + int src_nid, dst_nid; + + if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) + return false; + + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + return false; + + src_nid = cpu_to_node(env->src_cpu); + dst_nid = cpu_to_node(env->dst_cpu); + + if (src_nid == dst_nid) + return false; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) + return true; + + /* If either task or group weight get worse, don't do it. */ + if (task_weight(p, dst_nid) < task_weight(p, src_nid) || + group_weight(p, dst_nid) < group_weight(p, src_nid)) + return true; + + return false; +} + +#else +static inline bool migrate_improves_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} + +static inline bool migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) +{ + return false; +} +#endif + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -3997,6 +4877,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + env->flags |= LBF_SOME_PINNED; + /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +4887,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_SOME_PINNED; + env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } @@ -4030,11 +4912,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. + * 1) destination numa is preferred + * 2) task is cache cold, or + * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + if (!tsk_cache_hot) + tsk_cache_hot = migrate_degrades_locality(p, env); + + if (migrate_improves_locality(p, env)) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4077,8 +4972,6 @@ static int move_one_task(struct lb_env *env) return 0; } -static unsigned long task_h_load(struct task_struct *p); - static const unsigned int sched_nr_migrate_break = 32; /* @@ -4291,6 +5184,10 @@ struct sg_lb_stats { unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif }; /* @@ -4330,7 +5227,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * @idle: The idle status of the CPU for whose sd load_idx is obtained. * * Return: The load index. */ @@ -4447,7 +5344,7 @@ void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power; + unsigned long power, power_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -4459,7 +5356,7 @@ void update_group_power(struct sched_domain *sd, int cpu) return; } - power = 0; + power_orig = power = 0; if (child->flags & SD_OVERLAP) { /* @@ -4467,8 +5364,33 @@ void update_group_power(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) - power += power_of(cpu); + for_each_cpu(cpu, sched_group_cpus(sdg)) { + struct sched_group_power *sgp; + struct rq *rq = cpu_rq(cpu); + + /* + * build_sched_domains() -> init_sched_groups_power() + * gets here before we've attached the domains to the + * runqueues. + * + * Use power_of(), which is set irrespective of domains + * in update_cpu_power(). + * + * This avoids power/power_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct power_orig. + */ + if (unlikely(!rq->sd)) { + power_orig += power_of(cpu); + power += power_of(cpu); + continue; + } + + sgp = rq->sd->groups->sgp; + power_orig += sgp->power_orig; + power += sgp->power; + } } else { /* * !SD_OVERLAP domains can assume that child groups @@ -4477,12 +5399,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { + power_orig += group->sgp->power_orig; power += group->sgp->power; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = sdg->sgp->power = power; + sdg->sgp->power_orig = power_orig; + sdg->sgp->power = power; } /* @@ -4526,13 +5450,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * update_sd_pick_busiest(). And calculate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -4540,49 +5463,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * subtle and fragile situation. */ -struct sg_imb_stats { - unsigned long max_nr_running, min_nr_running; - unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group) { - sgi->max_cpu_load = sgi->max_nr_running = 0UL; - sgi->min_cpu_load = sgi->min_nr_running = ~0UL; + return group->sgp->imbalance; } -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, - unsigned long load, unsigned long nr_running) +/* + * Compute the group capacity. + * + * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit power unit capacity with that. + */ +static inline int sg_capacity(struct lb_env *env, struct sched_group *group) { - if (load > sgi->max_cpu_load) - sgi->max_cpu_load = load; - if (sgi->min_cpu_load > load) - sgi->min_cpu_load = load; + unsigned int capacity, smt, cpus; + unsigned int power, power_orig; - if (nr_running > sgi->max_nr_running) - sgi->max_nr_running = nr_running; - if (sgi->min_nr_running > nr_running) - sgi->min_nr_running = nr_running; -} + power = group->sgp->power; + power_orig = group->sgp->power_orig; + cpus = group->group_weight; -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) -{ - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && - (sgi->max_nr_running - sgi->min_nr_running) > 1) - return 1; + /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + capacity = cpus / smt; /* cores */ - return 0; + capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + if (!capacity) + capacity = fix_small_capacity(env->sd, group); + + return capacity; } /** @@ -4597,37 +5507,31 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - struct sg_imb_stats sgi; - unsigned long nr_running; unsigned long load; int i; - init_sg_imb_stats(&sgi); + memset(sgs, 0, sizeof(*sgs)); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - /* Bias balancing toward cpus of our domain */ - if (local_group) { + if (local_group) load = target_load(i, load_idx); - } else { + else load = source_load(i, load_idx); - update_sg_imb_stats(&sgi, load, nr_running); - } sgs->group_load += load; - sgs->sum_nr_running += nr_running; + sgs->sum_nr_running += rq->nr_running; +#ifdef CONFIG_NUMA_BALANCING + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; +#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; } - if (local_group && (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, group->sgp->next_update))) - update_group_power(env->sd, env->dst_cpu); - /* Adjust by relative CPU power of the group */ sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; @@ -4635,16 +5539,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_imb = sg_imbalanced(sgs, &sgi); - - sgs->group_capacity = - DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - - if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(env->sd, group); - sgs->group_weight = group->group_weight; + sgs->group_imb = sg_imbalanced(group); + sgs->group_capacity = sg_capacity(env, group); + if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; } @@ -4693,14 +5592,42 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; } +#ifdef CONFIG_NUMA_BALANCING +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->nr_numa_running) + return regular; + if (sgs->sum_nr_running > sgs->nr_preferred_running) + return remote; + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + if (rq->nr_running > rq->nr_numa_running) + return regular; + if (rq->nr_running > rq->nr_preferred_running) + return remote; + return all; +} +#else +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ + return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ + return regular; +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct lb_env *env, - struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -4720,11 +5647,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, if (local_group) { sds->local = sg; sgs = &sds->local_stat; + + if (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, sg->sgp->next_update)) + update_group_power(env->sd, env->dst_cpu); } - memset(sgs, 0, sizeof(*sgs)); update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + if (local_group) + goto next_group; + /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try @@ -4735,21 +5668,25 @@ static inline void update_sd_lb_stats(struct lb_env *env, * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && - sds->local && sds->local_stat.group_has_capacity) + if (prefer_sibling && sds->local && + sds->local_stat.group_has_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); - /* Now, start updating sd_lb_stats */ - sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; - - if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } +next_group: + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_pwr += sgs->group_power; + sg = sg->next; } while (sg != env->sd->groups); + + if (env->sd->flags & SD_NUMA) + env->fbq_type = fbq_classify_group(&sds->busiest_stat); } /** @@ -5053,15 +5990,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, - SCHED_POWER_SCALE); - unsigned long wl; + unsigned long power, capacity, wl; + enum fbq_type rt; + + rq = cpu_rq(i); + rt = fbq_classify_rq(rq); + + /* + * We classify groups/runqueues into three groups: + * - regular: there are !numa tasks + * - remote: there are numa tasks that run on the 'wrong' node + * - all: there is no distinction + * + * In order to avoid migrating ideally placed numa tasks, + * ignore those when there's better options. + * + * If we ignore the actual busiest queue to migrate another + * task, the next balance pass can still reduce the busiest + * queue by moving tasks around inside the node. + * + * If we cannot move enough load due to this classification + * the next pass will adjust the group classification and + * allow migration of more tasks. + * + * Both cases only affect the total convergence complexity. + */ + if (rt > env->fbq_type) + continue; + power = power_of(i); + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (!capacity) capacity = fix_small_capacity(env->sd, group); - rq = cpu_rq(i); wl = weighted_cpuload(i); /* @@ -5164,6 +6125,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5177,6 +6139,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, + .fbq_type = all, }; /* @@ -5268,17 +6231,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_SOME_PINNED; + env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; - /* Prevent to re-select dst_cpu via env's cpus */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); - /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5286,6 +6249,18 @@ more_balance: goto more_balance; } + /* + * We failed to reach balance because of affinity. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgp->imbalance; + + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + *group_imbalance = 1; + } else if (*group_imbalance) + *group_imbalance = 0; + } + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5393,6 +6368,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + u64 curr_cost = 0; this_rq->idle_stamp = rq_clock(this_rq); @@ -5409,15 +6385,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) for_each_domain(this_cpu, sd) { unsigned long interval; int continue_balancing = 1; + u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + break; + if (sd->flags & SD_BALANCE_NEWIDLE) { + t0 = sched_clock_cpu(this_cpu); + /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); + + domain_cost = sched_clock_cpu(this_cpu) - t0; + if (domain_cost > sd->max_newidle_lb_cost) + sd->max_newidle_lb_cost = domain_cost; + + curr_cost += domain_cost; } interval = msecs_to_jiffies(sd->balance_interval); @@ -5439,6 +6427,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } + + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; } /* @@ -5522,7 +6513,7 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(int call_cpu) +static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -5537,13 +6528,13 @@ static inline int find_new_ilb(int call_cpu) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; @@ -5572,16 +6563,16 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; + int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq()->sd); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - for (; sd; sd = sd->parent) - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -5589,16 +6580,16 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; + int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq()->sd); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - for (; sd; sd = sd->parent) - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -5653,24 +6644,48 @@ void update_max_interval(void) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; - struct rq *rq = cpu_rq(cpu); + int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize; + int need_serialize, need_decay = 0; + u64 max_cost = 0; update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { + /* + * Decay the newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. + */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + if (!(sd->flags & SD_LOAD_BALANCE)) continue; + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } + interval = sd->balance_interval; if (idle != CPU_IDLE) interval *= sd->busy_factor; @@ -5689,7 +6704,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* - * The LBF_SOME_PINNED logic could have changed + * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ @@ -5704,14 +6719,14 @@ out: next_balance = sd->last_balance + interval; update_next_balance = 1; } - + } + if (need_decay) { /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. */ - if (!continue_balancing) - break; + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); @@ -5729,9 +6744,9 @@ out: * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - struct rq *this_rq = cpu_rq(this_cpu); + int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; @@ -5758,7 +6773,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) update_idle_cpu_load(rq); raw_spin_unlock_irq(&rq->lock); - rebalance_domains(balance_cpu, CPU_IDLE); + rebalance_domains(rq, CPU_IDLE); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -5777,12 +6792,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; + struct sched_group_power *sgp; + int nr_busy, cpu = rq->cpu; - if (unlikely(idle_cpu(cpu))) + if (unlikely(rq->idle_balance)) return 0; /* @@ -5806,22 +6823,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) goto need_kick; rcu_read_lock(); - for_each_domain(cpu, sd) { - struct sched_group *sg = sd->groups; - struct sched_group_power *sgp = sg->sgp; - int nr_busy = atomic_read(&sgp->nr_busy_cpus); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) - goto need_kick_unlock; + if (sd) { + sgp = sd->groups->sgp; + nr_busy = atomic_read(&sgp->nr_busy_cpus); - if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight - && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) + if (nr_busy > 1) goto need_kick_unlock; - - if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) - break; } + + sd = rcu_dereference(per_cpu(sd_asym, cpu)); + + if (sd && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + rcu_read_unlock(); return 0; @@ -5831,7 +6848,7 @@ need_kick: return 1; } #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif /* @@ -5840,38 +6857,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); + struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(this_rq, idle); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - nohz_idle_balance(this_cpu, idle); + nohz_idle_balance(this_rq, idle); } -static inline int on_null_domain(int cpu) +static inline int on_null_domain(struct rq *rq) { - return !rcu_dereference_sched(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(rq->sd); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq) { /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) + if (unlikely(on_null_domain(rq))) + return; + + if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) - nohz_balancer_kick(cpu); + if (nohz_kick_needed(rq)) + nohz_balancer_kick(); #endif } @@ -6214,7 +7232,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - update_load_set(&se->load, 0); + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e4799..5716929a2e3a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false) /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing=. Allow PTE scanning to be forced on UMA machines - * for debugging the core machinery. + * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) -SCHED_FEAT(NUMA_FORCE, false) + +/* + * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a + * higher number of hinting faults are recorded during active load + * balancing. + */ +SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) + +/* + * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a + * lower number of hinting faults have been recorded. As this has + * the potential to prevent a task ever migrating to a new node + * due to CPU overload it is disabled by default. + */ +SCHED_FEAT(NUMA_RESIST_LOWER, false) #endif diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..516c3d9ceea1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..a2740b775b45 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq) * if we should look at the mask. It would be a shame * if we looked at the mask, but the mask was not * updated yet. + * + * Matched by the barrier in pull_rt_task(). */ - wmb(); + smp_wmb(); atomic_inc(&rq->rd->rto_count); } @@ -899,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -908,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -1169,13 +1185,10 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; - int cpu; - - cpu = task_cpu(p); if (p->nr_cpus_allowed == 1) goto out; @@ -1213,8 +1226,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) */ if (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio) && - (p->nr_cpus_allowed > 1)) { + curr->prio <= p->prio)) { int target = find_lowest_rq(p); if (target != -1) @@ -1630,6 +1642,12 @@ static int pull_rt_task(struct rq *this_rq) if (likely(!rt_overloaded(this_rq))) return 0; + /* + * Match the barrier from rt_set_overloaded; this guarantees that if we + * see overloaded we must also see the rto_mask bit. + */ + smp_rmb(); + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1720,7 +1738,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && - rt_task(rq->curr) && + (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); @@ -1931,8 +1949,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = sched_rr_timeslice; /* - * Requeue to the end of queue if we (and all of our ancestors) are the - * only element on the queue + * Requeue to the end of queue if we (and all of our ancestors) are not + * the only element on the queue */ for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dca..c2119fd20f8b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,12 +2,15 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> #include <linux/tick.h> +#include <linux/slab.h> #include "cpupri.h" +#include "cpudeadline.h" #include "cpuacct.h" struct rq; @@ -72,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq); #define NICE_0_SHIFT SCHED_LOAD_SHIFT /* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE (10) + +/* * These are the 'tuning knobs' of the scheduler: */ @@ -80,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq); */ #define RUNTIME_INF ((u64)~0ULL) +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + static inline int rt_policy(int policy) { - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; } static inline int task_has_rt_policy(struct task_struct *p) @@ -92,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -107,6 +144,47 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + * - store the maximum -deadline bandwidth of the system (the group); + * - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + * - dl_total_bw array contains, in the i-eth element, the currently + * allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +extern struct dl_bw *dl_bw_of(int i); + +struct dl_bw { + raw_spinlock_t lock; + u64 bw, total_bw; +}; extern struct mutex sched_domains_mutex; @@ -363,6 +441,42 @@ struct rt_rq { #endif }; +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + unsigned long dl_nr_total; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; +#endif +}; + #ifdef CONFIG_SMP /* @@ -381,6 +495,15 @@ struct root_domain { cpumask_var_t online; /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + + /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ @@ -408,6 +531,10 @@ struct rq { * remote CPUs use both these fields when doing load calculation. */ unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; @@ -427,6 +554,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -476,6 +604,9 @@ struct rq { u64 age_stamp; u64 idle_stamp; u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -552,6 +683,12 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA_BALANCING +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *, struct task_struct *); +#endif /* CONFIG_NUMA_BALANCING */ + #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ @@ -593,9 +730,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) return hsd; } +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain *, sd_numa); +DECLARE_PER_CPU(struct sched_domain *, sd_busy); +DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_power { atomic_t ref; @@ -605,6 +757,7 @@ struct sched_group_power { */ unsigned int power, power_orig; unsigned long next_update; + int imbalance; /* XXX unrelated to power but shared group state */ /* * Number of busy cpus in this group. */ @@ -719,6 +872,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); task_thread_info(p)->cpu = cpu; + p->wake_cpu = cpu; #endif } @@ -796,8 +950,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } - - static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; @@ -957,6 +1109,7 @@ static const u32 prio_to_wmult[40] = { #else #define ENQUEUE_WAKING 0 #endif +#define ENQUEUE_REPLENISH 8 #define DEQUEUE_SLEEP 1 @@ -974,7 +1127,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); @@ -992,6 +1145,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); @@ -1011,6 +1165,7 @@ struct sched_class { for (class = sched_class_highest; class; class = class->next) extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; @@ -1020,7 +1175,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq, int cpu); +extern void trigger_load_balance(struct rq *rq); extern void idle_balance(int this_cpu, struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); @@ -1037,8 +1192,11 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); + +extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void init_sched_dl_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); @@ -1046,6 +1204,12 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); + +unsigned long to_ratio(u64 period, u64 runtime); + extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); @@ -1220,6 +1384,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + /* * double_rq_lock - safely lock two runqueues * @@ -1304,8 +1486,10 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); -extern void account_cfs_bandwidth_used(int enabled, int was_enabled); +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce8..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. */ -static inline void sched_info_dequeued(struct task_struct *t) +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(task_rq(t)), delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; - rq_sched_info_dequeued(task_rq(t), delta); + rq_sched_info_dequeued(rq, delta); } /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. */ -static void sched_info_arrive(struct task_struct *t) +static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(task_rq(t)), delta = 0; + unsigned long long now = rq_clock(rq), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) t->sched_info.last_arrival = now; t->sched_info.pcount++; - rq_sched_info_arrive(task_rq(t), delta); + rq_sched_info_arrive(rq, delta); } /* @@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t) * the timestamp if it is already not set. It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct task_struct *t) +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(task_rq(t)); + t->sched_info.last_queued = rq_clock(rq); } /* @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t) * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ -static inline void sched_info_depart(struct task_struct *t) +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) { - unsigned long long delta = rq_clock(task_rq(t)) - + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; - rq_sched_info_depart(task_rq(t), delta); + rq_sched_info_depart(rq, delta); if (t->state == TASK_RUNNING) - sched_info_queued(t); + sched_info_queued(rq, t); } /* @@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { - struct rq *rq = task_rq(prev); - /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. */ if (prev != rq->idle) - sched_info_depart(prev); + sched_info_depart(rq, prev); if (next != rq->idle) - sched_info_arrive(next); + sched_info_arrive(rq, next); } static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) { if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); + __sched_info_switch(rq, prev, next); } #else -#define sched_info_queued(t) do { } while (0) +#define sched_info_queued(rq, t) do { } while (0) #define sched_info_reset_dequeued(t) do { } while (0) -#define sched_info_dequeued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_dequeued(rq, t) do { } while (0) +#define sched_info_depart(rq, t) do { } while (0) +#define sched_info_arrive(rq, next) do { } while (0) +#define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b9..fdb6bb0b3356 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class = { - .next = &rt_sched_class, + .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/wait.c b/kernel/sched/wait.c index d550920e040c..7d50f794e248 100644 --- a/kernel/wait.c +++ b/kernel/sched/wait.c @@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue); /* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ + __wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + int wake_flags = 1; /* XXX WF_SYNC */ + + if (unlikely(!q)) + return; + + if (unlikely(nr_exclusive != 1)) + wake_flags = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any * wake-function that tests for the wait-queue being active @@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + wait->private = current; + wait->func = autoremove_wake_function; + + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); + + return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); + /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on diff --git a/kernel/signal.c b/kernel/signal.c index ded28b91fa53..940b30ee9a30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER -int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) +int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { int err; diff --git a/kernel/smp.c b/kernel/smp.c index 0564571dcdf7..bd9f94028838 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,9 +15,9 @@ #include "smpboot.h" -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, + CSD_FLAG_WAIT = 0x02, }; struct call_function_data { @@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd) static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); + WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: @@ -139,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static -void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; int ipi; + if (wait) + csd->flags |= CSD_FLAG_WAIT; + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&csd->list, &dst->list); @@ -340,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, } put_cpu(); } +EXPORT_SYMBOL_GPL(__smp_call_function_single); /** * smp_call_function_many(): Run a function on a set of other CPUs. @@ -459,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); -#endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; @@ -524,6 +526,11 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } +void __weak smp_announce(void) +{ + printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); +} + /* Called by boot processor to activate the rest. */ void __init smp_init(void) { @@ -540,7 +547,7 @@ void __init smp_init(void) } /* Any cleanup work */ - printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_announce(); smp_cpus_done(setup_max_cpus); } diff --git a/kernel/softirq.c b/kernel/softirq.c index d7d498d8cc4f..8a1e6e104892 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,8 +6,6 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/export.h> @@ -29,7 +27,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/irq.h> -#include <asm/irq.h> /* - No shared variables, all the data are CPU local. - If a softirq needs serialization, let it serialize itself @@ -92,7 +89,7 @@ static void wakeup_softirqd(void) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip, unsigned int cnt) +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -100,46 +97,33 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) raw_local_irq_save(flags); /* - * The preempt tracer hooks into add_preempt_count and will break + * The preempt tracer hooks into preempt_count_add and will break * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET * is set and before current->softirq_enabled is cleared. * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += cnt; + __preempt_count_add(cnt); /* * Were softirqs turned off above: */ - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_off(ip); raw_local_irq_restore(flags); if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -#else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) -{ - add_preempt_count(cnt); - barrier(); -} +EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ -void local_bh_disable(void) -{ - __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); -} - -EXPORT_SYMBOL(local_bh_disable); - static void __local_bh_enable(unsigned int cnt) { - WARN_ON_ONCE(in_irq()); WARN_ON_ONCE(!irqs_disabled()); - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); - sub_preempt_count(cnt); + preempt_count_sub(cnt); } /* @@ -149,12 +133,13 @@ static void __local_bh_enable(unsigned int cnt) */ void _local_bh_enable(void) { + WARN_ON_ONCE(in_irq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); -static inline void _local_bh_enable_ip(unsigned long ip) +void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS @@ -169,29 +154,23 @@ static inline void _local_bh_enable_ip(unsigned long ip) * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); + preempt_count_sub(cnt - 1); - if (unlikely(!in_interrupt() && local_softirq_pending())) + if (unlikely(!in_interrupt() && local_softirq_pending())) { + /* + * Run softirq if any pending. And do it in its own stack + * as we may be calling this deep in a task call stack already. + */ do_softirq(); + } - dec_preempt_count(); + preempt_count_dec(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_enable(); #endif preempt_check_resched(); } - -void local_bh_enable(void) -{ - _local_bh_enable_ip(_RET_IP_); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - _local_bh_enable_ip(ip); -} -EXPORT_SYMBOL(local_bh_enable_ip); +EXPORT_SYMBOL(__local_bh_enable_ip); /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, @@ -209,14 +188,48 @@ EXPORT_SYMBOL(local_bh_enable_ip); #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) #define MAX_SOFTIRQ_RESTART 10 +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * When we run softirqs from irq_exit() and thus on the hardirq stack we need + * to keep the lockdep irq context tracking as tight as possible in order to + * not miss-qualify lock contexts and miss possible deadlocks. + */ + +static inline bool lockdep_softirq_start(void) +{ + bool in_hardirq = false; + + if (trace_hardirq_context(current)) { + in_hardirq = true; + trace_hardirq_exit(); + } + + lockdep_softirq_enter(); + + return in_hardirq; +} + +static inline void lockdep_softirq_end(bool in_hardirq) +{ + lockdep_softirq_exit(); + + if (in_hardirq) + trace_hardirq_enter(); +} +#else +static inline bool lockdep_softirq_start(void) { return false; } +static inline void lockdep_softirq_end(bool in_hardirq) { } +#endif + asmlinkage void __do_softirq(void) { - struct softirq_action *h; - __u32 pending; unsigned long end = jiffies + MAX_SOFTIRQ_TIME; - int cpu; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; + struct softirq_action *h; + bool in_hardirq; + __u32 pending; + int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -228,8 +241,8 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); - lockdep_softirq_enter(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + in_hardirq = lockdep_softirq_start(); cpu = smp_processor_id(); restart: @@ -256,7 +269,7 @@ restart: " exited with %08x?\n", vec_nr, softirq_to_name[vec_nr], h->action, prev_count, preempt_count()); - preempt_count() = prev_count; + preempt_count_set(prev_count); } rcu_bh_qs(cpu); @@ -276,15 +289,13 @@ restart: wakeup_softirqd(); } - lockdep_softirq_exit(); - + lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); + WARN_ON_ONCE(in_interrupt()); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } -#ifndef __ARCH_HAS_DO_SOFTIRQ - asmlinkage void do_softirq(void) { __u32 pending; @@ -298,20 +309,16 @@ asmlinkage void do_softirq(void) pending = local_softirq_pending(); if (pending) - __do_softirq(); + do_softirq_own_stack(); local_irq_restore(flags); } -#endif - /* * Enter an interrupt context. */ void irq_enter(void) { - int cpu = smp_processor_id(); - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* @@ -319,7 +326,7 @@ void irq_enter(void) * here, as softirq will be serviced on return from interrupt. */ local_bh_disable(); - tick_check_idle(cpu); + tick_check_idle(); _local_bh_enable(); } @@ -329,15 +336,21 @@ void irq_enter(void) static inline void invoke_softirq(void) { if (!force_irqthreads) { +#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if * it is the irq stack, because it should be near empty - * at this stage. But we have no way to know if the arch - * calls irq_exit() on the irq stack. So call softirq - * in its own stack to prevent from any overrun on top - * of a potentially deep task stack. + * at this stage. */ - do_softirq(); + __do_softirq(); +#else + /* + * Otherwise, irq_exit() is called on the task stack that can + * be potentially deep already. So call softirq in its own stack + * to prevent from any overrun. + */ + do_softirq_own_stack(); +#endif } else { wakeup_softirqd(); } @@ -368,13 +381,13 @@ void irq_exit(void) #endif account_irq_exit_time(current); - trace_hardirq_exit(); - sub_preempt_count(HARDIRQ_OFFSET); + preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); tick_irq_exit(); rcu_irq_exit(); + trace_hardirq_exit(); /* must be last! */ } /* @@ -618,146 +631,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, } EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ - struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - - list_add_tail(&cp->list, head); - - /* Trigger the softirq only if the list was previously empty. */ - if (head->next == &cp->list) - raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ - struct call_single_data *cp = data; - unsigned long flags; - int softirq; - - softirq = *(int *)cp->info; - local_irq_save(flags); - __local_trigger(cp, softirq); - local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - if (cpu_online(cpu)) { - cp->func = remote_softirq_receive; - cp->info = &softirq; - cp->flags = 0; - - __smp_call_function_single(cpu, cp, 0); - return 0; - } - return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu. If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ - if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) - __local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. - */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - unsigned long flags; - int this_cpu; - - local_irq_save(flags); - this_cpu = smp_processor_id(); - __send_remote_softirq(cp, cpu, this_cpu, softirq); - local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int remote_softirq_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - int i; - - local_irq_disable(); - for (i = 0; i < NR_SOFTIRQS; i++) { - struct list_head *head = &per_cpu(softirq_work_list[i], cpu); - struct list_head *local_head; - - if (list_empty(head)) - continue; - - local_head = &__get_cpu_var(softirq_work_list[i]); - list_splice_init(head, local_head); - raise_softirq_irqoff(i); - } - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block remote_softirq_cpu_notifier = { - .notifier_call = remote_softirq_cpu_notify, -}; - void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { - int i; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - for (i = 0; i < NR_SOFTIRQS; i++) - INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } - register_hotcpu_notifier(&remote_softirq_cpu_notifier); - open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } @@ -771,6 +655,10 @@ static void run_ksoftirqd(unsigned int cpu) { local_irq_disable(); if (local_softirq_pending()) { + /* + * We can safely run softirq on inline stack, as we are not deep + * in the task stack here. + */ __do_softirq(); rcu_note_context_switch(cpu); local_irq_enable(); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae30..84571e09c907 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,6 +20,7 @@ #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> +#include <linux/lglock.h> /* * Structure to determine completion condition and record errors. May @@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; +/* + * Avoids a race between stop_two_cpus and global stop_cpus, where + * the stoppers could get queued up in reverse order, leading to + * system deadlock. Using an lglock means stop_two_cpus remains + * relatively cheap. + */ +DEFINE_STATIC_LGLOCK(stop_cpus_lock); + static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); @@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) return done.executed ? done.ret : -ENOENT; } +/* This controls the threads on each CPU. */ +enum multi_stop_state { + /* Dummy starting state for thread. */ + MULTI_STOP_NONE, + /* Awaiting everyone to be scheduled. */ + MULTI_STOP_PREPARE, + /* Disable interrupts. */ + MULTI_STOP_DISABLE_IRQ, + /* Run the function */ + MULTI_STOP_RUN, + /* Exit */ + MULTI_STOP_EXIT, +}; + +struct multi_stop_data { + int (*fn)(void *); + void *data; + /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ + unsigned int num_threads; + const struct cpumask *active_cpus; + + enum multi_stop_state state; + atomic_t thread_ack; +}; + +static void set_state(struct multi_stop_data *msdata, + enum multi_stop_state newstate) +{ + /* Reset ack counter. */ + atomic_set(&msdata->thread_ack, msdata->num_threads); + smp_wmb(); + msdata->state = newstate; +} + +/* Last one to ack a state moves to the next state. */ +static void ack_state(struct multi_stop_data *msdata) +{ + if (atomic_dec_and_test(&msdata->thread_ack)) + set_state(msdata, msdata->state + 1); +} + +/* This is the cpu_stop function which stops the CPU. */ +static int multi_cpu_stop(void *data) +{ + struct multi_stop_data *msdata = data; + enum multi_stop_state curstate = MULTI_STOP_NONE; + int cpu = smp_processor_id(), err = 0; + unsigned long flags; + bool is_active; + + /* + * When called from stop_machine_from_inactive_cpu(), irq might + * already be disabled. Save the state and restore it on exit. + */ + local_save_flags(flags); + + if (!msdata->active_cpus) + is_active = cpu == cpumask_first(cpu_online_mask); + else + is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + + /* Simple state machine */ + do { + /* Chill out and ensure we re-read multi_stop_state. */ + cpu_relax(); + if (msdata->state != curstate) { + curstate = msdata->state; + switch (curstate) { + case MULTI_STOP_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case MULTI_STOP_RUN: + if (is_active) + err = msdata->fn(msdata->data); + break; + default: + break; + } + ack_state(msdata); + } + } while (curstate != MULTI_STOP_EXIT); + + local_irq_restore(flags); + return err; +} + +struct irq_cpu_stop_queue_work_info { + int cpu1; + int cpu2; + struct cpu_stop_work *work1; + struct cpu_stop_work *work2; +}; + +/* + * This function is always run with irqs and preemption disabled. + * This guarantees that both work1 and work2 get queued, before + * our local migrate thread gets the chance to preempt us. + */ +static void irq_cpu_stop_queue_work(void *arg) +{ + struct irq_cpu_stop_queue_work_info *info = arg; + cpu_stop_queue_work(info->cpu1, info->work1); + cpu_stop_queue_work(info->cpu2, info->work2); +} + +/** + * stop_two_cpus - stops two cpus + * @cpu1: the cpu to stop + * @cpu2: the other cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Stops both the current and specified CPU and runs @fn on one of them. + * + * returns when both are completed. + */ +int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + struct cpu_stop_work work1, work2; + struct irq_cpu_stop_queue_work_info call_args; + struct multi_stop_data msdata; + + preempt_disable(); + msdata = (struct multi_stop_data){ + .fn = fn, + .data = arg, + .num_threads = 2, + .active_cpus = cpumask_of(cpu1), + }; + + work1 = work2 = (struct cpu_stop_work){ + .fn = multi_cpu_stop, + .arg = &msdata, + .done = &done + }; + + call_args = (struct irq_cpu_stop_queue_work_info){ + .cpu1 = cpu1, + .cpu2 = cpu2, + .work1 = &work1, + .work2 = &work2, + }; + + cpu_stop_init_done(&done, 2); + set_state(&msdata, MULTI_STOP_PREPARE); + + /* + * If we observe both CPUs active we know _cpu_down() cannot yet have + * queued its stop_machine works and therefore ours will get executed + * first. Or its not either one of our CPUs that's getting unplugged, + * in which case we don't care. + * + * This relies on the stopper workqueues to be FIFO. + */ + if (!cpu_active(cpu1) || !cpu_active(cpu2)) { + preempt_enable(); + return -ENOENT; + } + + lg_local_lock(&stop_cpus_lock); + /* + * Queuing needs to be done by the lowest numbered CPU, to ensure + * that works are always queued in the same order on every CPU. + * This prevents deadlocks. + */ + smp_call_function_single(min(cpu1, cpu2), + &irq_cpu_stop_queue_work, + &call_args, 0); + lg_local_unlock(&stop_cpus_lock); + preempt_enable(); + + wait_for_completion(&done.completion); + + return done.executed ? done.ret : -ENOENT; +} + /** * stop_one_cpu_nowait - stop a cpu but don't wait for completion * @cpu: cpu to stop @@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - preempt_disable(); + lg_global_lock(&stop_cpus_lock); for_each_cpu(cpu, cpumask) cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); - preempt_enable(); + lg_global_unlock(&stop_cpus_lock); } static int __stop_cpus(const struct cpumask *cpumask, @@ -359,98 +546,14 @@ early_initcall(cpu_stop_init); #ifdef CONFIG_STOP_MACHINE -/* This controls the threads on each CPU. */ -enum stopmachine_state { - /* Dummy starting state for thread. */ - STOPMACHINE_NONE, - /* Awaiting everyone to be scheduled. */ - STOPMACHINE_PREPARE, - /* Disable interrupts. */ - STOPMACHINE_DISABLE_IRQ, - /* Run the function */ - STOPMACHINE_RUN, - /* Exit */ - STOPMACHINE_EXIT, -}; - -struct stop_machine_data { - int (*fn)(void *); - void *data; - /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ - unsigned int num_threads; - const struct cpumask *active_cpus; - - enum stopmachine_state state; - atomic_t thread_ack; -}; - -static void set_state(struct stop_machine_data *smdata, - enum stopmachine_state newstate) -{ - /* Reset ack counter. */ - atomic_set(&smdata->thread_ack, smdata->num_threads); - smp_wmb(); - smdata->state = newstate; -} - -/* Last one to ack a state moves to the next state. */ -static void ack_state(struct stop_machine_data *smdata) -{ - if (atomic_dec_and_test(&smdata->thread_ack)) - set_state(smdata, smdata->state + 1); -} - -/* This is the cpu_stop function which stops the CPU. */ -static int stop_machine_cpu_stop(void *data) -{ - struct stop_machine_data *smdata = data; - enum stopmachine_state curstate = STOPMACHINE_NONE; - int cpu = smp_processor_id(), err = 0; - unsigned long flags; - bool is_active; - - /* - * When called from stop_machine_from_inactive_cpu(), irq might - * already be disabled. Save the state and restore it on exit. - */ - local_save_flags(flags); - - if (!smdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - - /* Simple state machine */ - do { - /* Chill out and ensure we re-read stopmachine_state. */ - cpu_relax(); - if (smdata->state != curstate) { - curstate = smdata->state; - switch (curstate) { - case STOPMACHINE_DISABLE_IRQ: - local_irq_disable(); - hard_irq_disable(); - break; - case STOPMACHINE_RUN: - if (is_active) - err = smdata->fn(smdata->data); - break; - default: - break; - } - ack_state(smdata); - } - } while (curstate != STOPMACHINE_EXIT); - - local_irq_restore(flags); - return err; -} - int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct stop_machine_data smdata = { .fn = fn, .data = data, - .num_threads = num_online_cpus(), - .active_cpus = cpus }; + struct multi_stop_data msdata = { + .fn = fn, + .data = data, + .num_threads = num_online_cpus(), + .active_cpus = cpus, + }; if (!stop_machine_initialized) { /* @@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) unsigned long flags; int ret; - WARN_ON_ONCE(smdata.num_threads != 1); + WARN_ON_ONCE(msdata.num_threads != 1); local_irq_save(flags); hard_irq_disable(); @@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) } /* Set the initial state and stop all online cpus. */ - set_state(&smdata, STOPMACHINE_PREPARE); - return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); + set_state(&msdata, MULTI_STOP_PREPARE); + return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) @@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine); int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct stop_machine_data smdata = { .fn = fn, .data = data, + struct multi_stop_data msdata = { .fn = fn, .data = data, .active_cpus = cpus }; struct cpu_stop_done done; int ret; /* Local CPU must be inactive and CPU hotplug in progress. */ BUG_ON(cpu_active(raw_smp_processor_id())); - smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ + msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ /* No proper task established and can't sleep - busy wait for lock. */ while (!mutex_trylock(&stop_cpus_mutex)) cpu_relax(); /* Schedule work on other CPUs and execute directly for local CPU */ - set_state(&smdata, STOPMACHINE_PREPARE); + set_state(&msdata, MULTI_STOP_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); - queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, + queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, &done); - ret = stop_machine_cpu_stop(&smdata); + ret = multi_cpu_stop(&msdata); /* Busy wait for completion. */ while (!completion_done(&done.completion)) diff --git a/kernel/sys.c b/kernel/sys.c index c18ecca575b4..c72311324ea7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,7 +16,6 @@ #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> -#include <linux/kexec.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3f..332cefcdb04b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,8 +95,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP @@ -190,7 +188,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ -static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; +static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -371,13 +369,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "numa_balancing_scan_period_reset", - .data = &sysctl_numa_balancing_scan_period_reset, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, .maxlen = sizeof(unsigned int), @@ -391,6 +382,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_migrate_deferred", + .data = &sysctl_numa_balancing_migrate_deferred, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { @@ -962,9 +960,10 @@ static struct ctl_table kern_table[] = { { .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "hung_task_timeout_secs", @@ -1049,6 +1048,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = perf_proc_update_handler, + .extra1 = &one, }, { .procname = "perf_cpu_time_max_percent", @@ -1119,7 +1119,14 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, }, { .procname = "page-cluster", @@ -2214,8 +2221,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) + if (!first) { err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + } err = proc_put_long(&buffer, &left, val, false); if (err) break; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b609213ca9a2..653cbbd9e7ad 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file, if (get_user(value, vec + i)) goto out_kfree; - str += snprintf(str, end - str, "%lu\t", value); + str += scnprintf(str, end - str, "%lu\t", value); } result = kernel_write(file, buffer, str - buffer, 0); @@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file, if (get_user(value, vec + i)) goto out_kfree; - str += snprintf(str, end - str, "%lu\t", value); + str += scnprintf(str, end - str, "%lu\t", value); } result = kernel_write(file, buffer, str - buffer, 0); @@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file, if (get_user(dnaddr, (__le16 __user *)newval)) goto out; - len = snprintf(buf, sizeof(buf), "%hu.%hu", + len = scnprintf(buf, sizeof(buf), "%hu.%hu", le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 000000000000..3e9868d47535 --- /dev/null +++ b/kernel/system_certificates.S @@ -0,0 +1,20 @@ +#include <linux/export.h> +#include <linux/init.h> + + __INITRODATA + + .align 8 + .globl VMLINUX_SYMBOL(system_certificate_list) +VMLINUX_SYMBOL(system_certificate_list): +__cert_list_start: + .incbin "kernel/x509_certificate_list" +__cert_list_end: + + .align 8 + .globl VMLINUX_SYMBOL(system_certificate_list_size) +VMLINUX_SYMBOL(system_certificate_list_size): +#ifdef CONFIG_64BIT + .quad __cert_list_end - __cert_list_start +#else + .long __cert_list_end - __cert_list_start +#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 000000000000..52ebc70263f4 --- /dev/null +++ b/kernel/system_keyring.c @@ -0,0 +1,105 @@ +/* System trusted keyring for trusted public keys + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> +#include "module-internal.h" + +struct key *system_trusted_keyring; +EXPORT_SYMBOL_GPL(system_trusted_keyring); + +extern __initconst const u8 system_certificate_list[]; +extern __initconst const unsigned long system_certificate_list_size; + +/* + * Load the compiled-in keys + */ +static __init int system_trusted_keyring_init(void) +{ + pr_notice("Initialise system trusted keyring\n"); + + system_trusted_keyring = + keyring_alloc(".system_keyring", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(system_trusted_keyring)) + panic("Can't allocate system trusted keyring\n"); + + set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(system_trusted_keyring_init); + +/* + * Load the compiled-in list of X.509 certificates. + */ +static __init int load_system_certificate_list(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading compiled-in X.509 certificates\n"); + + p = system_certificate_list; + end = p + system_certificate_list_size; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), + "asymmetric", + NULL, + p, + plen, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_TRUSTED); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_system_certificate_list); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 145bb4d3bd4d..13d2f7cd65db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) struct listener_list *listeners; struct listener *s, *tmp, *s2; unsigned int cpu; + int ret = 0; if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; @@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); - if (!s) + if (!s) { + ret = -ENOMEM; goto cleanup; - + } s->pid = pid; s->valid = 1; @@ -339,7 +341,7 @@ cleanup: } up_write(&listeners->sem); } - return 0; + return ret; } static int parse(struct nlattr *na, struct cpumask *mask) @@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) if (!na) goto err; - if (nla_put(skb, type, sizeof(pid), &pid) < 0) + if (nla_put(skb, type, sizeof(pid), &pid) < 0) { + nla_nest_cancel(skb, na); goto err; + } ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); - if (!ret) + if (!ret) { + nla_nest_cancel(skb, na); goto err; + } nla_nest_end(skb, na); return nla_data(ret); @@ -667,17 +673,18 @@ err: nlmsg_free(rep_skb); } -static struct genl_ops taskstats_ops = { - .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, -}; - -static struct genl_ops cgroupstats_ops = { - .cmd = CGROUPSTATS_CMD_GET, - .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, +static const struct genl_ops taskstats_ops[] = { + { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_user_cmd, + .policy = taskstats_cmd_get_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = CGROUPSTATS_CMD_GET, + .doit = cgroupstats_user_cmd, + .policy = cgroupstats_cmd_get_policy, + }, }; /* Needed early in initialization */ @@ -696,26 +703,13 @@ static int __init taskstats_init(void) { int rc; - rc = genl_register_family(&family); + rc = genl_register_family_with_ops(&family, taskstats_ops); if (rc) return rc; - rc = genl_register_ops(&family, &taskstats_ops); - if (rc < 0) - goto err; - - rc = genl_register_ops(&family, &cgroupstats_ops); - if (rc < 0) - goto err_cgroup_ops; - family_registered = 1; pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; -err_cgroup_ops: - genl_unregister_ops(&family, &taskstats_ops); -err: - genl_unregister_family(&family); - return rc; } /* diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2b62fe86f9ec..3ce6e8c5f3fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -100,7 +100,7 @@ config NO_HZ_FULL # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency - depends on 64BIT + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index eec50fcef9e4..88c9c65a430d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EINVAL; return hrtimer_get_res(baseid, tp); } @@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EINVAL; *tp = ktime_to_timespec(base->gettime()); return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 662c5798a685..086ad6043bcb 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, const char *buf, size_t count) { char name[CS_NAME_LEN]; - size_t ret = sysfs_get_uname(buf, name, count); + ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce; if (ret < 0) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..ba3e502c955a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred - * @cs: Pointer to clocksource - * + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters */ -static u64 clocksource_max_deferment(struct clocksource *cs) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) { u64 max_nsecs, max_cycles; /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) + * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) * which is equivalent to the below. - * max_cycles < (2^63)/(cs->mult + cs->maxadj) - * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) - * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) - * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) - * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) + * max_cycles < (2^63)/(mult + maxadj) + * max_cycles < 2^(log2((2^63)/(mult + maxadj))) + * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) + * max_cycles < 2^(63 - log2(mult + maxadj)) + * max_cycles < 1 << (63 - log2(mult + maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); + max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is - * determined by the minimum of max_cycles and cs->mask. + * determined by the minimum of max_cycles and mask. * Note: Here we subtract the maxadj to make sure we don't sleep for * too long if there's a large negative adjustment. */ - max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, - cs->shift); + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + + return max_nsecs; +} + +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs: Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ + u64 max_nsecs; + max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, + cs->mask); /* * To ensure that the clocksource does not wrap whilst we are idle, * limit the time the clocksource can be deferred by 12.5%. Please @@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret; + ssize_t ret; mutex_lock(&clocksource_mutex); @@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, { struct clocksource *cs; char name[CS_NAME_LEN]; - size_t ret; + ssize_t ret; ret = sysfs_get_uname(buf, name, count); if (ret < 0) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bb2215174f05..af8d1d4f3d55 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work) * called as close as possible to 500 ms before the new second starts. * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... + * We want the clock to be within a couple of ticks from the target. */ if (!ntp_synced()) { /* @@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work) } getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { struct timespec adjust = now; fail = -ENODEV; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0b479a6a22bb..0abb36464281 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -8,25 +8,28 @@ #include <linux/clocksource.h> #include <linux/init.h> #include <linux/jiffies.h> +#include <linux/ktime.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/syscore_ops.h> -#include <linux/timer.h> +#include <linux/hrtimer.h> #include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> struct clock_data { + ktime_t wrap_kt; u64 epoch_ns; - u32 epoch_cyc; - u32 epoch_cyc_copy; + u64 epoch_cyc; + seqcount_t seq; unsigned long rate; u32 mult; u32 shift; bool suspended; }; -static void sched_clock_poll(unsigned long wrap_ticks); -static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static struct hrtimer sched_clock_timer; static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); @@ -35,42 +38,46 @@ static struct clock_data cd = { .mult = NSEC_PER_SEC / HZ, }; -static u32 __read_mostly sched_clock_mask = 0xffffffff; +static u64 __read_mostly sched_clock_mask; -static u32 notrace jiffy_sched_clock_read(void) +static u64 notrace jiffy_sched_clock_read(void) { - return (u32)(jiffies - INITIAL_JIFFIES); + /* + * We don't need to use get_jiffies_64 on 32-bit arches here + * because we register with BITS_PER_LONG + */ + return (u64)(jiffies - INITIAL_JIFFIES); } -static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static u32 __read_mostly (*read_sched_clock_32)(void); + +static u64 notrace read_sched_clock_32_wrapper(void) +{ + return read_sched_clock_32(); +} + +static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } -static unsigned long long notrace sched_clock_32(void) +unsigned long long notrace sched_clock(void) { u64 epoch_ns; - u32 epoch_cyc; - u32 cyc; + u64 epoch_cyc; + u64 cyc; + unsigned long seq; if (cd.suspended) return cd.epoch_ns; - /* - * Load the epoch_cyc and epoch_ns atomically. We do this by - * ensuring that we always write epoch_cyc, epoch_ns and - * epoch_cyc_copy in strict order, and read them in strict order. - * If epoch_cyc and epoch_cyc_copy are not equal, then we're in - * the middle of an update, and we should repeat the load. - */ do { + seq = raw_read_seqcount_begin(&cd.seq); epoch_cyc = cd.epoch_cyc; - smp_rmb(); epoch_ns = cd.epoch_ns; - smp_rmb(); - } while (epoch_cyc != cd.epoch_cyc_copy); + } while (read_seqcount_retry(&cd.seq, seq)); cyc = read_sched_clock(); cyc = (cyc - epoch_cyc) & sched_clock_mask; @@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void) static void notrace update_sched_clock(void) { unsigned long flags; - u32 cyc; + u64 cyc; u64 ns; cyc = read_sched_clock(); ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, cd.mult, cd.shift); - /* - * Write epoch_cyc and epoch_ns in a way that the update is - * detectable in cyc_to_fixed_sched_clock(). - */ + raw_local_irq_save(flags); - cd.epoch_cyc_copy = cyc; - smp_wmb(); + raw_write_seqcount_begin(&cd.seq); cd.epoch_ns = ns; - smp_wmb(); cd.epoch_cyc = cyc; + raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } -static void sched_clock_poll(unsigned long wrap_ticks) +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { - mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); update_sched_clock(); + hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +void __init sched_clock_register(u64 (*read)(void), int bits, + unsigned long rate) { - unsigned long r, w; + unsigned long r; u64 res, wrap; char r_unit; if (cd.rate > rate) return; - BUG_ON(bits > 32); WARN_ON(!irqs_disabled()); read_sched_clock = read; - sched_clock_mask = (1ULL << bits) - 1; + sched_clock_mask = CLOCKSOURCE_MASK(bits); cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); r = rate; if (r >= 4000000) { @@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) r_unit = ' '; /* calculate how many ns until we wrap */ - wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); - do_div(wrap, NSEC_PER_MSEC); - w = wrap; + wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); + cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); /* calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, cd.mult, cd.shift); - pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", - bits, r, r_unit, res, w); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", + bits, r, r_unit, res, wrap); - /* - * Start the timer to keep sched_clock() properly updated and - * sets the initial epoch. - */ - sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); update_sched_clock(); /* @@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; - -unsigned long long notrace sched_clock(void) +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) { - return sched_clock_func(); + read_sched_clock_32 = read; + sched_clock_register(read_sched_clock_32_wrapper, bits, rate); } void __init sched_clock_postinit(void) @@ -180,14 +177,22 @@ void __init sched_clock_postinit(void) * make it the final one one. */ if (read_sched_clock == jiffy_sched_clock_read) - setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); - sched_clock_poll(sched_clock_timer.data); + update_sched_clock(); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + sched_clock_timer.function = sched_clock_poll; + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } static int sched_clock_suspend(void) { - sched_clock_poll(sched_clock_timer.data); + sched_clock_poll(&sched_clock_timer); cd.suspended = true; return 0; } @@ -195,7 +200,6 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { cd.epoch_cyc = read_sched_clock(); - cd.epoch_cyc_copy = cd.epoch_cyc; cd.suspended = false; } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 218bcb565fed..43780ab5e279 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev, struct clock_event_device *newdev) { if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_PERCPU) || (newdev->features & CLOCK_EVT_FEAT_C3STOP)) return false; @@ -537,10 +538,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) * Called from irq_enter() when idle was interrupted to reenable the * per cpu device. */ -void tick_check_oneshot_broadcast(int cpu) +void tick_check_oneshot_broadcast_this_cpu(void) { - if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = &__get_cpu_var(tick_cpu_device); /* * We might be in the middle of switching over from diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ecdfe0e..20b2fe37d105 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; /* @@ -70,6 +85,7 @@ static void tick_periodic(int cpu) do_timer(1); write_sequnlock(&jiffies_lock); + update_wall_time(); } update_process_times(user_mode(get_irq_regs())); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bc906cad709b..8329669b51ec 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); -extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* * NO_HZ / high resolution timer shared code @@ -51,7 +51,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); -extern void tick_check_oneshot_broadcast(int cpu); +extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) @@ -62,7 +62,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ @@ -155,3 +155,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) #endif extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f834..08cb0c3b8ccb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now) tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&jiffies_lock); + update_wall_time(); } /* @@ -177,7 +178,7 @@ static bool can_stop_full_tick(void) * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ - if (!sched_clock_stable) { + if (!sched_clock_stable()) { trace_tick_stop(0, "unstable sched clock\n"); /* * Don't allow the user to think they can get @@ -361,8 +362,8 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; - +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -391,11 +392,9 @@ __setup("nohz=", setup_tick_nohz); */ static void tick_nohz_update_jiffies(ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - ts->idle_waketime = now; + __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); @@ -426,17 +425,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda } -static void tick_nohz_stop_idle(int cpu, ktime_t now) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - update_ts_time_stats(cpu, ts, now, NULL); + update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { ktime_t now = ktime_get(); @@ -465,7 +462,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -506,7 +503,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -711,8 +708,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; return false; + } if (need_resched()) return false; @@ -752,7 +751,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(cpu, ts); + now = tick_nohz_start_idle(ts); if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; @@ -799,11 +798,6 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -914,8 +908,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -928,7 +921,7 @@ void tick_nohz_idle_exit(void) now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); @@ -973,7 +966,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return; local_irq_disable(); @@ -981,7 +974,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -1012,12 +1005,10 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu, ktime_t now) +static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ - - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta; /* @@ -1032,36 +1023,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now) #endif } -static inline void tick_check_nohz(int cpu) +static inline void tick_check_nohz_this_cpu(void) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(cpu, now); + tick_nohz_kick_tick(ts, now); } } #else static inline void tick_nohz_switch_to_nohz(void) { } -static inline void tick_check_nohz(int cpu) { } +static inline void tick_check_nohz_this_cpu(void) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ -void tick_check_idle(int cpu) +void tick_check_idle(void) { - tick_check_oneshot_broadcast(cpu); - tick_check_nohz(cpu); + tick_check_oneshot_broadcast_this_cpu(); + tick_check_nohz_this_cpu(); } /* @@ -1139,8 +1130,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 947ba25a95a0..0aa4ce81bc16 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -77,7 +77,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); - tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -90,8 +90,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) } /** - * timekeeper_setup_internals - Set up internals to use clocksource clock. + * tk_setup_internals - Set up internals to use clocksource clock. * + * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment @@ -595,7 +596,7 @@ s32 timekeeping_get_tai_offset(void) static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; - tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /** @@ -610,6 +611,7 @@ void timekeeping_set_tai_offset(s32 tai_offset) raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); __timekeeping_set_tai_offset(tk, tai_offset); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); @@ -1023,6 +1025,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } + + timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1130,16 +1134,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * we can adjust by 1. */ error >>= 2; - /* - * XXX - In update_wall_time, we round up to the next - * nanosecond, and store the amount rounded up into - * the error. This causes the likely below to be unlikely. - * - * The proper fix is to avoid rounding up by using - * the high precision tk->xtime_nsec instead of - * xtime.tv_nsec everywhere. Fixing this will take some - * time. - */ if (likely(error <= interval)) adj = 1; else @@ -1255,7 +1249,7 @@ out_adjust: static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; - unsigned int action = 0; + unsigned int clock_set = 0; while (tk->xtime_nsec >= nsecps) { int leap; @@ -1277,11 +1271,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); - clock_was_set_delayed(); - action = TK_CLOCK_WAS_SET; + clock_set = TK_CLOCK_WAS_SET; } } - return action; + return clock_set; } /** @@ -1294,7 +1287,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * Returns the unconsumed cycles. */ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, - u32 shift) + u32 shift, + unsigned int *clock_set) { cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; @@ -1308,7 +1302,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; - accumulate_nsecs_to_secs(tk); + *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ raw_nsecs = (u64)tk->raw_interval << shift; @@ -1347,7 +1341,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) tk->xtime_nsec -= remainder; tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; - + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) @@ -1359,14 +1353,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * update_wall_time - Uses the current clocksource to increment the wall time * */ -static void update_wall_time(void) +void update_wall_time(void) { struct clocksource *clock; struct timekeeper *real_tk = &timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; - unsigned int action; + unsigned int clock_set = 0; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1401,7 +1395,8 @@ static void update_wall_time(void) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift); + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } @@ -1419,7 +1414,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - action = accumulate_nsecs_to_secs(tk); + clock_set |= accumulate_nsecs_to_secs(tk); write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ @@ -1435,10 +1430,12 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, action); + timekeeping_update(real_tk, clock_set); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (clock_set) + clock_was_set(); } /** @@ -1583,7 +1580,6 @@ struct timespec get_monotonic_coarse(void) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_wall_time(); calc_global_load(ticks); } @@ -1613,9 +1609,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * ktime_get_update_offsets - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets - * Called from hrtimer_interupt() or retrigger_next_event() + * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) @@ -1697,12 +1694,14 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); - update_pvclock_gtod(tk, true); - clock_was_set_delayed(); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (tai != orig_tai) + clock_was_set(); + ntp_notify_cmos_timer(); return ret; @@ -1738,4 +1737,5 @@ void xtime_update(unsigned long ticks) write_seqlock(&jiffies_lock); do_timer(ticks); write_sequnlock(&jiffies_lock); + update_wall_time(); } diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 0b537f27b559..1fb08f21302e 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v) period = ktime_to_timespec(time); ms = period.tv_nsec / 1000000; - seq_puts(m, "Timer Stats Version: v0.2\n"); + seq_puts(m, "Timer Stats Version: v0.3\n"); seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", - atomic_read(&overflow_count)); + seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); + seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..accfd241b9e5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { - int preempt_count = preempt_count(); + int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_release(&lockdep_map); - if (preempt_count != preempt_count()) { + if (count != preempt_count()) { WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); + fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ - preempt_count() = preempt_count; + preempt_count_set(count); } } @@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); if (!base) return -ENOMEM; diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d7e2068e4b71..1378e84fbe39 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM_RUNTIME),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560bfb95..f785aef65799 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> +#include <linux/list.h> #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -107,10 +111,18 @@ record_it: * Send out a notify for this process, if we haven't done so since a trace * started */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk) { + unsigned long flags; + struct blk_trace *bt; + tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, goto record_it; } + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; - memcpy(&buts.name, &cbuts.name, 32); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; - if (copy_to_user(arg, &buts.name, 32)) { + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } @@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); + spin_lock_irq(&running_trace_lock); + list_del(&bt->running_list); + spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03cf44ac54d3..cd7f76d1eb86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -85,6 +85,8 @@ int function_trace_stop __read_mostly; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -278,6 +280,29 @@ static void update_global_ops(void) global_ops.func = func; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif + static void update_ftrace_function(void) { ftrace_func_t func; @@ -296,16 +321,61 @@ static void update_ftrace_function(void) !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) - function_trace_op = ftrace_global_list; + set_function_trace_op = ftrace_global_list; else - function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ - function_trace_op = &ftrace_list_end; + set_function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; } + /* If there's no change, then do nothing more here */ + if (ftrace_trace_function == func) + return; + + update_function_graph_func(); + + /* + * If we are using the list function, it doesn't care + * about the function_trace_ops. + */ + if (func == ftrace_ops_list_func) { + ftrace_trace_function = func; + /* + * Don't even bother setting function_trace_ops, + * it would be racy to do so anyway. + */ + return; + } + +#ifndef CONFIG_DYNAMIC_FTRACE + /* + * For static tracing, we need to be a bit more careful. + * The function change takes affect immediately. Thus, + * we need to coorditate the setting of the function_trace_ops + * with the setting of the ftrace_trace_function. + * + * Set the function to the list ops, which will call the + * function we want, albeit indirectly, but it handles the + * ftrace_ops and doesn't depend on function_trace_op. + */ + ftrace_trace_function = ftrace_ops_list_func; + /* + * Make sure all CPUs see this. Yes this is slow, but static + * tracing is slow and nasty to have enabled. + */ + schedule_on_each_cpu(ftrace_sync); + /* Now all cpus are using the list ops. */ + function_trace_op = set_function_trace_op; + /* Make sure the function_trace_op is visible on all CPUs */ + smp_wmb(); + /* Nasty way to force a rmb on all cpus */ + smp_call_function(ftrace_sync_ipi, NULL, 1); + /* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ + ftrace_trace_function = func; } @@ -367,9 +437,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (unlikely(ftrace_disabled)) - return -ENODEV; - if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -413,24 +480,10 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - if (ftrace_disabled) - return -ENODEV; - if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; @@ -445,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - if (!ret) { - /* - * The ftrace_ops is now removed from the list, - * so there'll be no new users. We must ensure - * all current users are done before we free - * the control data. - * Note synchronize_sched() is not enough, as we - * use preempt_disable() to do RCU, but the function - * tracer can be called where RCU is not active - * (before user_exit()). - */ - schedule_on_each_cpu(ftrace_sync); - control_ops_free(ops); - } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -468,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); - /* - * Dynamic ops may be freed, we must make sure that all - * callers are done before leaving this function. - * - * Again, normal synchronize_sched() is not good enough. - * We need to do a hard force of sched synchronization. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - schedule_on_each_cpu(ftrace_sync); - - return 0; } @@ -781,7 +809,7 @@ static int ftrace_profile_init(void) int cpu; int ret = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { ret = ftrace_profile_init_cpu(cpu); if (ret) break; @@ -1088,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; -loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1998,8 +2013,14 @@ void ftrace_modify_all_code(int command) else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (update && ftrace_trace_function != ftrace_ops_list_func) + if (update && ftrace_trace_function != ftrace_ops_list_func) { + function_trace_op = set_function_trace_op; + smp_wmb(); + /* If irqs are disabled, we are in stop machine */ + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); ftrace_update_ftrace_func(ftrace_trace_function); + } if (command & FTRACE_START_FUNC_RET) ftrace_enable_ftrace_graph_caller(); @@ -2088,10 +2109,15 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { bool hash_enable = true; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; + ret = __register_ftrace_function(ops); + if (ret) + return ret; + ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; @@ -2113,12 +2139,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_shutdown(struct ftrace_ops *ops, int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command) { bool hash_disable = true; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; ftrace_start_up--; /* @@ -2152,10 +2183,42 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) - return; + if (!command || !ftrace_enabled) { + /* + * If these are control ops, they still need their + * per_cpu field freed. Since, function tracing is + * not currently active, we can just free them + * without synchronizing all CPUs. + */ + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + return 0; + } ftrace_run_update_code(command); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + * The same goes for freeing the per_cpu data of the control + * ops. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + schedule_on_each_cpu(ftrace_sync); + + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + } + + return 0; } static void ftrace_startup_sysctl(void) @@ -2734,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_filter_lseek() should be used as the lseek routine, and + * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -3060,16 +3123,13 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - ret = __register_ftrace_function(&trace_probe_ops); - if (!ret) - ret = ftrace_startup(&trace_probe_ops, 0); + ret = ftrace_startup(&trace_probe_ops, 0); ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { - int ret; int i; if (!ftrace_probe_registered) @@ -3082,9 +3142,7 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - ret = __unregister_ftrace_function(&trace_probe_ops); - if (!ret) - ftrace_shutdown(&trace_probe_ops, 0); + ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; } @@ -3307,7 +3365,11 @@ void unregister_ftrace_function_probe_all(char *glob) static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -int register_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; int ret = 0; @@ -3326,7 +3388,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd) return ret; } -int unregister_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; int ret = -ENODEV; @@ -3641,7 +3707,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); static int __init set_graph_function(char *str) { @@ -3659,7 +3725,7 @@ static void __init set_ftrace_early_graph(char *buf) func = strsep(&buf, ","); /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - func); + FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3759,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -3767,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -3776,15 +3842,25 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; -int ftrace_graph_filter_enabled; +int ftrace_graph_notrace_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { + unsigned long *table; + size_t size; + int *count; + const struct seq_operations *seq_ops; +}; static void * __g_next(struct seq_file *m, loff_t *pos) { - if (*pos >= ftrace_graph_count) + struct ftrace_graph_data *fgd = m->private; + + if (*pos >= *fgd->count) return NULL; - return &ftrace_graph_funcs[*pos]; + return &fgd->table[*pos]; } static void * @@ -3796,10 +3872,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos) static void *g_start(struct seq_file *m, loff_t *pos) { + struct ftrace_graph_data *fgd = m->private; + mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_filter_enabled && !*pos) + if (!*fgd->count && !*pos) return (void *)1; return __g_next(m, pos); @@ -3835,38 +3913,88 @@ static const struct seq_operations ftrace_graph_seq_ops = { }; static int -ftrace_graph_open(struct inode *inode, struct file *file) +__ftrace_graph_open(struct inode *inode, struct file *file, + struct ftrace_graph_data *fgd) { int ret = 0; - if (unlikely(ftrace_disabled)) - return -ENODEV; - mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ftrace_graph_filter_enabled = 0; - ftrace_graph_count = 0; - memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + *fgd->count = 0; + memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); } mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_graph_seq_ops); + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, fgd->seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = fgd; + } + } else + file->private_data = fgd; return ret; } static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_notrace_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_notrace_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int ftrace_graph_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + + kfree(m->private); seq_release(inode, file); + } else { + kfree(file->private_data); + } + return 0; } static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -3879,7 +4007,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + if (!not && *idx >= size) return -EBUSY; search_len = strlen(search); @@ -3907,7 +4035,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) fail = 0; if (!exists) { array[(*idx)++] = rec->ip; - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + if (*idx >= size) goto out; } } else { @@ -3925,8 +4053,6 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = !!(*idx); - return 0; } @@ -3935,36 +4061,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - ssize_t read, ret; + ssize_t read, ret = 0; + struct ftrace_graph_data *fgd = file->private_data; if (!cnt) return 0; - mutex_lock(&graph_lock); - - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { - ret = -ENOMEM; - goto out_unlock; - } + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) + return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; + mutex_lock(&graph_lock); + /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - parser.buffer); - if (ret) - goto out_free; + ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, + parser.buffer); + + mutex_unlock(&graph_lock); } - ret = read; + if (!ret) + ret = read; -out_free: trace_parser_put(&parser); -out_unlock: - mutex_unlock(&graph_lock); return ret; } @@ -3973,7 +4096,15 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, + .release = ftrace_graph_release, +}; + +static const struct file_operations ftrace_graph_notrace_fops = { + .open = ftrace_graph_notrace_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = tracing_lseek, .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -3997,6 +4128,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); + trace_create_file("set_graph_notrace", 0444, d_tracer, + NULL, + &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -4290,12 +4424,15 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - 0; \ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ }) -# define ftrace_shutdown(ops, command) do { } while (0) +# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) + # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -4320,12 +4457,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); + + /* + * Control funcs (perf) uses RCU. Only trace if + * RCU is currently active. + */ + if (!rcu_is_watching()) + goto out; + do_for_each_ftrace_op(op, ftrace_control_list) { if (!(op->flags & FTRACE_OPS_FL_STUB) && !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); + out: trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4631,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_pid_release, }; @@ -4695,9 +4841,7 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - ret = __register_ftrace_function(ops); - if (!ret) - ret = ftrace_startup(ops, 0); + ret = ftrace_startup(ops, 0); mutex_unlock(&ftrace_lock); @@ -4716,9 +4860,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_lock); - ret = __unregister_ftrace_function(ops); - if (!ret) - ftrace_shutdown(ops, 0); + ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -4778,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -4912,6 +5055,37 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } +/* Just a place holder for function graph */ +static struct ftrace_ops fgraph_ops __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | + FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +static void update_function_graph_func(void) +{ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list == &global_ops && + global_ops.next == &ftrace_list_end)) + ftrace_graph_entry = __ftrace_graph_entry; + else + ftrace_graph_entry = ftrace_graph_entry_test; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -4936,9 +5110,18 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, } ftrace_graph_return = retfunc; - ftrace_graph_entry = entryfunc; - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); + + ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -4955,7 +5138,8 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..294b8a271a04 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2558,7 +2558,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (unlikely(test_time_stamp(delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable; + local_clock_stable = sched_clock_stable(); #endif WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7974ba20557d..20c755e018ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_current_check_discard(struct ring_buffer *buffer, - struct ftrace_event_call *call, void *rec, - struct ring_buffer_event *event) +int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) { - return filter_check_discard(call, rec, buffer, event); + if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + !filter_match_preds(file->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } -EXPORT_SYMBOL_GPL(filter_current_check_discard); +EXPORT_SYMBOL_GPL(call_filter_check_discard); cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { @@ -575,6 +595,28 @@ void free_snapshot(struct trace_array *tr) } /** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to trace_snapshot(), but it will allocate the @@ -587,11 +629,10 @@ void free_snapshot(struct trace_array *tr) */ void tracing_snapshot_alloc(void) { - struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); - if (WARN_ON(ret < 0)) + ret = tracing_alloc_snapshot(); + if (ret < 0) return; tracing_snapshot(); @@ -603,6 +644,12 @@ void tracing_snapshot(void) WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ @@ -843,9 +890,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, if (isspace(ch)) { parser->buffer[parser->idx] = 0; parser->cont = false; - } else { + } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + } else { + ret = -EINVAL; + goto out; } *ppos += read; @@ -1261,21 +1311,6 @@ int is_tracing_stopped(void) } /** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected. This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) -{ - tracing_disabled = 1; - ftrace_stop(); - tracing_off_permanent(); -} - -/** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, @@ -1509,7 +1544,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -1630,7 +1666,7 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); } @@ -1714,7 +1750,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, entry->size = trace.nr_entries; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: @@ -1816,7 +1852,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out_drop_count: @@ -2008,7 +2044,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) { + if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -2063,7 +2099,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) { + if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -2760,7 +2796,7 @@ static void show_snapshot_main_help(struct seq_file *m) seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); seq_printf(m, "# is not a '0' or '1')\n"); } @@ -2964,6 +3000,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. @@ -3142,19 +3183,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { + int ret; + if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else - return 0; + file->f_pos = ret = 0; + + return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_release, }; @@ -4198,12 +4243,6 @@ out: return sret; } -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - __free_page(buf->page); -} - static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { @@ -4215,7 +4254,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = { .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, + .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -4899,7 +4938,7 @@ static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_snapshot_release, }; @@ -5454,12 +5493,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = { .func = ftrace_trace_snapshot_callback, }; -static int register_snapshot_cmd(void) +static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else -static inline int register_snapshot_cmd(void) { return 0; } +static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) @@ -5869,6 +5908,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + buf->tr = tr; + buf->buffer = ring_buffer_alloc(size, rb_flags); if (!buf->buffer) return -ENOMEM; @@ -6253,6 +6294,17 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->trace_buffer = &global_trace.trace_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->trace_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c86fb7a2b4..02b592f2d4b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,3 +1,4 @@ + #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H @@ -124,6 +125,7 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, }; #define TRACE_BUF_SIZE 1024 @@ -192,8 +194,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); - DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); + struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; @@ -514,6 +516,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -585,6 +588,8 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); +loff_t tracing_lseek(struct file *file, loff_t offset, int whence); + extern cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ @@ -711,6 +716,8 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 +#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -730,15 +737,16 @@ extern void __trace_graph_return(struct trace_array *tr, #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_filter_enabled) + if (!ftrace_graph_count) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -758,11 +766,31 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_notrace_count) + return 0; + + for (i = 0; i < ftrace_graph_notrace_count; i++) { + if (addr == ftrace_graph_notrace_funcs[i]) + return 1; + } + + return 0; +} #else static inline int ftrace_graph_addr(unsigned long addr) { return 1; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t @@ -986,40 +1014,216 @@ struct filter_pred { extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_call *call, +extern int apply_event_filter(struct ftrace_event_file *file, char *filter_string); extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +extern int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * trace_find_event_field(struct ftrace_event_call *call, char *name); -static inline int -filter_check_discard(struct ftrace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } - - return 0; -} - extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern struct ftrace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); + +static inline void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} + extern struct mutex event_mutex; extern struct list_head ftrace_events; +extern const struct file_operations event_trigger_fops; + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +struct event_trigger_data { + unsigned long count; + int ref; + struct event_trigger_ops *ops; + struct event_command *cmd_ops; + struct event_filter __rcu *filter; + char *filter_str; + void *private_data; + struct list_head list; +}; + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @func: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command). + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). + */ +struct event_trigger_ops { + void (*func)(struct event_trigger_data *data); + int (*init)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + void (*free)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/ftrace_event.h. + * + * @post_trigger: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * All the methods below, except for @set_filter(), must be + * implemented. + * + * @func: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_trigger_ops @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_trigger_ops @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + * event_trigger_ops implementation associated with the command. + */ +struct event_command { + struct list_head list; + char *name; + enum event_trigger_type trigger_type; + bool post_trigger; + int (*func)(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *params); + int (*reg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + void (*unreg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + int (*set_filter)(char *filter_str, + struct event_trigger_data *data, + struct ftrace_event_file *file); + struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +extern int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d594da0dc03c..697fb9bac8f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 80c36bcf66e8..e854f420e033 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,9 +24,15 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; /* No tracing, just counting, so no obvious leak */ @@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 368a4d50cc30..e71ffd4eccb5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -342,6 +342,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } +int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) +{ + return __ftrace_event_enable_disable(file, enable, soft_disable); +} + static int ftrace_event_enable_disable(struct ftrace_event_file *file, int enable) { @@ -421,11 +427,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) } } -static void *event_file_data(struct file *filp) -{ - return ACCESS_ONCE(file_inode(filp)->i_private); -} - static void remove_event_file_dir(struct ftrace_event_file *file) { struct dentry *dir = file->dir; @@ -989,7 +990,7 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; struct trace_seq *s; int r = -ENODEV; @@ -1004,12 +1005,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - call = event_file_data(filp); - if (call) - print_event_filter(call, s); + file = event_file_data(filp); + if (file) + print_event_filter(file, s); mutex_unlock(&event_mutex); - if (call) + if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -1021,7 +1022,7 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; char *buf; int err = -ENODEV; @@ -1039,9 +1040,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, buf[cnt] = '\0'; mutex_lock(&event_mutex); - call = event_file_data(filp); - if (call) - err = apply_event_filter(call, buf); + file = event_file_data(filp); + if (file) + err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); free_page((unsigned long) buf); @@ -1062,6 +1063,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct trace_array *tr; int ret; + if (tracing_is_disabled()) + return -ENODEV; + /* Make sure the system still exists */ mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); @@ -1108,6 +1112,9 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; + if (tracing_is_disabled()) + return -ENODEV; + if (trace_array_get(tr) < 0) return -ENODEV; @@ -1124,11 +1131,12 @@ static int system_tr_open(struct inode *inode, struct file *filp) if (ret < 0) { trace_array_put(tr); kfree(dir); + return ret; } filp->private_data = dir; - return ret; + return 0; } static int subsystem_release(struct inode *inode, struct file *file) @@ -1539,9 +1547,12 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) return -1; } } - trace_create_file("filter", 0644, file->dir, call, + trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); @@ -1577,6 +1588,7 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); + destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. After finding the call for this @@ -1637,6 +1649,8 @@ trace_create_new_event(struct ftrace_event_call *call, file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); + atomic_set(&file->tm_ref, 0); + INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); return file; @@ -1700,7 +1714,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_preds(call); + destroy_call_preds(call); } static int probe_remove_event_call(struct ftrace_event_call *call) @@ -1841,20 +1855,7 @@ __trace_add_event_dirs(struct trace_array *tr) } } -#ifdef CONFIG_DYNAMIC_FTRACE - -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct event_probe_data { - struct ftrace_event_file *file; - unsigned long count; - int ref; - bool enable; -}; - -static struct ftrace_event_file * +struct ftrace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct ftrace_event_file *file; @@ -1877,6 +1878,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + static void event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) { @@ -2303,9 +2317,15 @@ int event_trace_del_tracer(struct trace_array *tr) { mutex_lock(&event_mutex); + /* Disable any event triggers and associated soft-disabled events */ + clear_event_triggers(tr); + /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); @@ -2366,6 +2386,8 @@ static __init int event_trace_enable(void) register_event_cmds(); + register_trigger_cmds(); + return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 97daa8cf958d..8a8631926a07 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } +static inline struct event_filter *event_filter(struct ftrace_event_file *file) +{ + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + return file->event_call->filter; + else + return file->filter; +} + /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter = event_filter(file); if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); @@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void filter_disable(struct ftrace_event_call *call) +static void call_filter_disable(struct ftrace_event_call *call) { call->flags &= ~TRACE_EVENT_FL_FILTERED; } +static void filter_disable(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call_filter_disable(call); + else + file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} + static void __free_filter(struct event_filter *filter) { if (!filter) @@ -781,16 +799,35 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +void free_event_filter(struct event_filter *filter) +{ + __free_filter(filter); +} + +void destroy_call_preds(struct ftrace_event_call *call) +{ + __free_filter(call->filter); + call->filter = NULL; +} + +static void destroy_file_preds(struct ftrace_event_file *file) +{ + __free_filter(file->filter); + file->filter = NULL; +} + /* - * Called when destroying the ftrace_event_call. - * The call is being freed, so we do not need to worry about - * the call being currently used. This is for module code removing + * Called when destroying the ftrace_event_file. + * The file is being freed, so we do not need to worry about + * the file being currently used. This is for module code removing * the tracepoints from within it. */ -void destroy_preds(struct ftrace_event_call *call) +void destroy_preds(struct ftrace_event_file *file) { - __free_filter(call->filter); - call->filter = NULL; + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + destroy_call_preds(file->event_call); + else + destroy_file_preds(file); } static struct event_filter *__alloc_filter(void) @@ -825,28 +862,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static inline void __remove_filter(struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + + filter_disable(file); + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + remove_filter_string(call->filter); + else + remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct event_subsystem *system, + struct trace_array *tr) +{ + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - filter_disable(call); - remove_filter_string(call->filter); + __remove_filter(file); } } -static void filter_free_subsystem_filters(struct event_subsystem *system) +static inline void __free_subsystem_filter(struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { + __free_filter(call->filter); + call->filter = NULL; + } else { + __free_filter(file->filter); + file->filter = NULL; + } +} + +static void filter_free_subsystem_filters(struct event_subsystem *system, + struct trace_array *tr) +{ + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - __free_filter(call->filter); - call->filter = NULL; + __free_subsystem_filter(file); } } @@ -1617,15 +1682,85 @@ fail: return err; } +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, + struct event_filter *filter) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + rcu_assign_pointer(call->filter, filter); + else + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + RCU_INIT_POINTER(call->filter, NULL); + else + RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + return true; + + if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && + (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) + return true; + + return false; +} + struct filter_list { struct list_head list; struct event_filter *filter; }; static int replace_system_preds(struct event_subsystem *system, + struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { + struct ftrace_event_file *file; struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; @@ -1633,8 +1768,8 @@ static int replace_system_preds(struct event_subsystem *system, bool fail = true; int err; - list_for_each_entry(call, &ftrace_events, list) { - + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; @@ -1644,18 +1779,20 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + event_set_no_set_filter_flag(file); else - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + event_clear_no_set_filter_flag(file); } - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; + call = file->event_call; + if (strcmp(call->class->system, system->name) != 0) continue; - if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + if (event_no_set_filter_flag(file)) continue; filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); @@ -1676,17 +1813,17 @@ static int replace_system_preds(struct event_subsystem *system, err = replace_preds(call, filter, ps, filter_string, false); if (err) { - filter_disable(call); + filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); append_filter_err(ps, filter); } else - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = call->filter; - rcu_assign_pointer(call->filter, filter_item->filter); + filter = event_filter(file); + event_set_filter(file, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1806,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call, return err; } +int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + return create_filter(call, filter_str, set_str, filterp); +} + /** * create_system_filter - create a filter for an event_subsystem * @system: event_subsystem to create a filter for @@ -1816,6 +1960,7 @@ static int create_filter(struct ftrace_event_call *call, * and always remembers @filter_str. */ static int create_system_filter(struct event_subsystem *system, + struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct event_filter *filter = NULL; @@ -1824,7 +1969,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, ps, filter_str); + err = replace_system_preds(system, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -1840,20 +1985,25 @@ static int create_system_filter(struct event_subsystem *system, } /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +int apply_event_filter(struct ftrace_event_file *file, char *filter_string) { + struct ftrace_event_call *call = file->event_call; struct event_filter *filter; int err; if (!strcmp(strstrip(filter_string), "0")) { - filter_disable(call); - filter = call->filter; + filter_disable(file); + filter = event_filter(file); + if (!filter) return 0; - RCU_INIT_POINTER(call->filter, NULL); + + event_clear_filter(file); + /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); + return 0; } @@ -1866,14 +2016,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) * string */ if (filter) { - struct event_filter *tmp = call->filter; + struct event_filter *tmp; + tmp = event_filter(file); if (!err) - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); else - filter_disable(call); + filter_disable(file); - rcu_assign_pointer(call->filter, filter); + event_set_filter(file, filter); if (tmp) { /* Make sure the call is done with the filter */ @@ -1889,6 +2040,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; struct event_filter *filter; int err = 0; @@ -1901,18 +2053,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system); + filter_free_subsystem_filters(system, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, filter_string, &filter); + err = create_system_filter(system, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 000000000000..8efbb69b04f0 --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1437 @@ +/* + * trace_events_trigger - trace event triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +static void +trigger_data_free(struct event_trigger_data *data) +{ + if (data->cmd_ops->set_filter) + data->cmd_ops->set_filter(NULL, data, NULL); + + synchronize_sched(); /* make sure current triggers exit before free */ + kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The ftrace_event_file associated with the event + * @rec: The trace entry for the event, NULL for unconditional invocation + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command. If rec is + * non-NULL, it means that the trigger requires further processing and + * shouldn't be unconditionally invoked. If rec is non-NULL and the + * trigger has a filter associated with it, rec will checked against + * the filter and if the record matches the trigger will be invoked. + * If the trigger is a 'post_trigger', meaning it shouldn't be invoked + * in any case until the current event is written, the trigger + * function isn't invoked but the bit associated with the deferred + * trigger is set in the return value. + * + * Returns an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + * + * Return: an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + */ +enum event_trigger_type +event_triggers_call(struct ftrace_event_file *file, void *rec) +{ + struct event_trigger_data *data; + enum event_trigger_type tt = ETT_NONE; + struct event_filter *filter; + + if (list_empty(&file->triggers)) + return tt; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (!rec) { + data->ops->func(data); + continue; + } + filter = rcu_dereference(data->filter); + if (filter && !filter_match_preds(filter, rec)) + continue; + if (data->cmd_ops->post_trigger) { + tt |= data->cmd_ops->trigger_type; + continue; + } + data->ops->func(data); + } + return tt; +} +EXPORT_SYMBOL_GPL(event_triggers_call); + +/** + * event_triggers_post_call - Call 'post_triggers' for a trace event + * @file: The ftrace_event_file associated with the event + * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command, if the + * corresponding bit is set in the tt enum passed into this function. + * See @event_triggers_call for details on how those bits are set. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + */ +void +event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type & tt) + data->ops->func(data); + } +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ + struct ftrace_event_file *event_file = event_file_data(m->private); + + if (t == SHOW_AVAILABLE_TRIGGERS) + return NULL; + + return seq_list_next(t, &event_file->triggers, pos); +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *event_file; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + event_file = event_file_data(m->private); + if (unlikely(!event_file)) + return ERR_PTR(-ENODEV); + + if (list_empty(&event_file->triggers)) + return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + + return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ + mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct event_command *p; + + if (v == SHOW_AVAILABLE_TRIGGERS) { + seq_puts(m, "# Available triggers:\n"); + seq_putc(m, '#'); + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_reverse(p, &trigger_commands, list) + seq_printf(m, " %s", p->name); + seq_putc(m, '\n'); + mutex_unlock(&trigger_cmd_mutex); + return 0; + } + + data = list_entry(v, struct event_trigger_data, list); + data->ops->print(m, data->ops, data); + + return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { + .start = trigger_start, + .next = trigger_next, + .stop = trigger_stop, + .show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&event_mutex); + + if (unlikely(!event_file_data(file))) { + mutex_unlock(&event_mutex); + return -ENODEV; + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &event_triggers_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = file; + } + } + + mutex_unlock(&event_mutex); + + return ret; +} + +static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +{ + char *command, *next = buff; + struct event_command *p; + int ret = -EINVAL; + + command = strsep(&next, ": \t"); + command = (command[0] != '!') ? command : command + 1; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(p, file, buff, command, next); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_event_file *event_file; + ssize_t ret; + char *buf; + + if (!cnt) + return 0; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long)buf); + return -EFAULT; + } + buf[cnt] = '\0'; + strim(buf); + + mutex_lock(&event_mutex); + event_file = event_file_data(file); + if (unlikely(!event_file)) { + mutex_unlock(&event_mutex); + free_page((unsigned long)buf); + return -ENODEV; + } + ret = trigger_process_regex(event_file, buf); + mutex_unlock(&event_mutex); + + free_page((unsigned long)buf); + if (ret < 0) + goto out; + + *ppos += cnt; + ret = cnt; + out: + return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ + mutex_lock(&event_mutex); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + mutex_unlock(&event_mutex); + + return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ + return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ + return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { + .open = event_trigger_open, + .read = seq_read, + .write = event_trigger_write, + .llseek = tracing_lseek, + .release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +static __init int register_event_command(struct event_command *cmd) +{ + struct event_command *p; + int ret = 0; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &trigger_commands); + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +static __init int unregister_event_command(struct event_command *cmd) +{ + struct event_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, + void *data, char *filter_str) +{ + long count = (long)data; + + seq_printf(m, "%s", name); + + if (count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", count); + + if (filter_str) + seq_printf(m, " if %s\n", filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) + trigger_data_free(data); +} + +static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, + int trigger_enable) +{ + int ret = 0; + + if (trigger_enable) { + if (atomic_inc_return(&file->tm_ref) > 1) + return ret; + set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 1, 1); + } else { + if (atomic_dec_return(&file->tm_ref) > 0) + return ret; + clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 0, 1); + } + + return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable(). That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + struct event_trigger_data *data; + list_for_each_entry_rcu(data, &file->triggers, list) { + trace_event_trigger_enable_disable(file, 0); + if (data->ops->free) + data->ops->free(data->ops, data); + } + } +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The ftrace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. + */ +static void update_cond_flag(struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool set_cond = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->filter || data->cmd_ops->post_trigger) { + set_cond = true; + break; + } + } + + if (set_cond) + set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +/** + * event_trigger_callback - Generic event_command @func implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The ftrace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param: The params portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @func method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_callback(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + char *trigger = NULL; + char *number; + int ret; + + /* separate the trigger from the filter (t:n [if filter]) */ + if (param && isdigit(param[0])) + trigger = strsep(¶m, " \t"); + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + ret = 0; + out: + return ret; + + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). + * + * Return: 0 on success, errno otherwise + */ +static int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data = trigger_data; + struct event_filter *filter = NULL, *tmp; + int ret = -EINVAL; + char *s; + + if (!filter_str) /* clear the current filter */ + goto assign; + + s = strsep(&filter_str, " \t"); + + if (!strlen(s) || strcmp(s, "if") != 0) + goto out; + + if (!filter_str) + goto out; + + /* The filter is for the 'trigger' event, not the triggered event */ + ret = create_event_filter(file->event_call, filter_str, false, &filter); + if (ret) + goto out; + assign: + tmp = rcu_access_pointer(data->filter); + + rcu_assign_pointer(data->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + free_event_filter(tmp); + } + + kfree(data->filter_str); + data->filter_str = NULL; + + if (filter_str) { + data->filter_str = kstrdup(filter_str, GFP_KERNEL); + if (!data->filter_str) { + free_event_filter(rcu_access_pointer(data->filter)); + data->filter = NULL; + ret = -ENOMEM; + } + } + out: + return ret; +} + +static void +traceon_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceon", m, (void *)data->count, + data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceoff", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { + .func = traceon_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { + .func = traceon_count_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { + .func = traceoff_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { + .func = traceoff_count_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_trigger_ops : + &traceon_trigger_ops; + else + ops = param ? &traceoff_count_trigger_ops : + &traceoff_trigger_ops; + + return ops; +} + +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data) +{ + tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + snapshot_trigger(data); +} + +static int +register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + int ret = register_trigger(glob, ops, data, file); + + if (ret > 0 && tracing_alloc_snapshot() != 0) { + unregister_trigger(glob, ops, data, file); + ret = 0; + } + + return ret; +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("snapshot", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { + .func = snapshot_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { + .func = snapshot_count_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ + return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .func = event_trigger_callback, + .reg = register_snapshot_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = snapshot_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_snapshot_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +/* + * Skip 3: + * stacktrace_trigger() + * event_triggers_post_call() + * ftrace_raw_event_xxx() + */ +#define STACK_SKIP 3 + +static void +stacktrace_trigger(struct event_trigger_data *data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + stacktrace_trigger(data); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("stacktrace", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { + .func = stacktrace_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { + .func = stacktrace_count_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ + return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { + .name = "stacktrace", + .trigger_type = ETT_STACKTRACE, + .post_trigger = true, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = stacktrace_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_stacktrace_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ + unregister_event_command(&trigger_traceon_cmd); + unregister_event_command(&trigger_traceoff_cmd); +} + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct enable_trigger_data { + struct ftrace_event_file *file; + bool enable; +}; + +static void +event_enable_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (enable_data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_trigger(data); +} + +static int +event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + seq_printf(m, "%s:%s:%s", + enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->file->event_call->class->system, + enable_data->file->event_call->name); + + if (data->count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", data->count); + + if (data->filter_str) + seq_printf(m, " if %s\n", data->filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +static void +event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + trace_event_enable_disable(enable_data->file, 0, 1); + module_put(enable_data->file->event_call->mod); + trigger_data_free(data); + kfree(enable_data); + } +} + +static struct event_trigger_ops event_enable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static int +event_enable_trigger_func(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct ftrace_event_file *event_enable_file; + struct enable_trigger_data *enable_data; + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + struct trace_array *tr = file->tr; + const char *system; + const char *event; + char *trigger; + char *number; + bool enable; + int ret; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (s:e:n [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + system = strsep(&trigger, ":"); + if (!trigger) + return -EINVAL; + + event = strsep(&trigger, ":"); + + ret = -EINVAL; + event_enable_file = find_event_file(tr, system, event); + if (!event_enable_file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + if (!enable_data) { + kfree(trigger_data); + goto out; + } + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + enable_data->enable = enable; + enable_data->file = event_enable_file; + trigger_data->private_data = enable_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + kfree(enable_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(event_enable_file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = trace_event_enable_disable(event_enable_file, 1, 1); + if (ret < 0) + goto out_put; + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + return ret; + + out_disable: + trace_event_enable_disable(event_enable_file, 0, 1); + out_put: + module_put(event_enable_file->event_call->mod); + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + kfree(enable_data); + goto out; +} + +static int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct enable_trigger_data *test_enable_data; + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + test_enable_data = test->private_data; + if (test_enable_data && + (test_enable_data->file == enable_data->file)) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +static void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *test_enable_data = test->private_data; + struct enable_trigger_data *enable_data; + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + enable_data = data->private_data; + if (enable_data && + (enable_data->file == test_enable_data->file)) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_trigger_ops : + &event_enable_trigger_ops; + else + ops = param ? &event_disable_count_trigger_ops : + &event_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_enable_cmd = { + .name = ENABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { + .name = DISABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_enable_cmd); + unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_enable_disable_cmds(); + + return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_traceon_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_traceoff_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_traceon_traceoff_cmds(); + + return ret; +} + +__init int register_trigger_cmds(void) +{ + register_trigger_traceon_traceoff_cmds(); + register_trigger_snapshot_cmd(); + register_trigger_stacktrace_cmd(); + register_trigger_enable_disable_cmds(); + + return 0; +} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a74670088..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b5c09242683d..0b99120d395c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -82,9 +82,9 @@ static struct trace_array *graph_array; * to fill in space into DURATION column. */ enum { - DURATION_FILL_FULL = -1, - DURATION_FILL_START = -2, - DURATION_FILL_END = -3, + FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; static enum print_line_t @@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return -EBUSY; } + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in debugfs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + calltime = trace_clock_local(); index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; current->ret_stack[index].fp = frame_pointer; - *depth = index; + *depth = current->curr_ret_stack; return 0; } @@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - if (unlikely(index < 0)) { + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { ftrace_graph_stop(); WARN_ON(1); /* Might as well panic, otherwise we have no where to go */ @@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } /* * The trace should run after decrementing the ret counter @@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); return 1; @@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) /* trace it when it is-nested-in or is a function enabled. */ if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || + ftrace_graph_ignore_irqs()) || (trace->depth < 0) || (max_depth && trace->depth >= max_depth)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); } @@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_duration(DURATION_FILL_START, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = print_graph_duration(DURATION_FILL_END, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ - switch (duration) { - case DURATION_FILL_FULL: + switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { + case FLAGS_FILL_FULL: ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_START: + case FLAGS_FILL_START: ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_END: + case FLAGS_FILL_END: ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return TRACE_TYPE_PARTIAL_LINE; /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 243f6834d026..bdbae450c13e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -27,18 +27,12 @@ /** * Kprobe event core functions */ -struct trace_probe { +struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ const char *symbol; /* symbol name */ - struct ftrace_event_class class; - struct ftrace_event_call call; - struct list_head files; - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; + struct trace_probe tp; }; struct event_file_link { @@ -46,56 +40,46 @@ struct event_file_link { struct list_head list; }; -#define SIZEOF_TRACE_PROBE(n) \ - (offsetof(struct trace_probe, args) + \ +#define SIZEOF_TRACE_KPROBE(n) \ + (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -static __kprobes bool trace_probe_is_return(struct trace_probe *tp) +static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) { - return tp->rp.handler != NULL; + return tk->rp.handler != NULL; } -static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) +static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) { - return tp->symbol ? tp->symbol : "unknown"; + return tk->symbol ? tk->symbol : "unknown"; } -static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) +static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) { - return tp->rp.kp.offset; + return tk->rp.kp.offset; } -static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) +static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) { - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); + return !!(kprobe_gone(&tk->rp.kp)); } -static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) -{ - return !!(tp->flags & TP_FLAG_REGISTERED); -} - -static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) -{ - return !!(kprobe_gone(&tp->rp.kp)); -} - -static __kprobes bool trace_probe_within_module(struct trace_probe *tp, - struct module *mod) +static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, + struct module *mod) { int len = strlen(mod->name); - const char *name = trace_probe_symbol(tp); + const char *name = trace_kprobe_symbol(tk); return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) +static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) { - return !!strchr(trace_probe_symbol(tp), ':'); + return !!strchr(trace_kprobe_symbol(tk), ':'); } -static int register_probe_event(struct trace_probe *tp); -static int unregister_probe_event(struct trace_probe *tp); +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); static DEFINE_MUTEX(probe_lock); static LIST_HEAD(probe_list); @@ -104,45 +88,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + + if (sc->addr) + sc->addr += sc->offset; + + return sc->addr; +} + +void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + update_symbol_cache(sc); + + return sc; +} + +/* + * Kprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + + if (!maxlen) + return; + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else { + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); + } +} + +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +#define DEFINE_FETCH_symbol(type) \ +__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) + +/* kprobes don't support file_offset fetch methods */ +#define fetch_file_offset_u8 NULL +#define fetch_file_offset_u16 NULL +#define fetch_file_offset_u32 NULL +#define fetch_file_offset_u64 NULL +#define fetch_file_offset_string NULL +#define fetch_file_offset_string_size NULL + +/* Fetch type information table */ +const struct fetch_type kprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + /* * Allocate new trace_probe and initialize it (including kprobes). */ -static struct trace_probe *alloc_trace_probe(const char *group, +static struct trace_kprobe *alloc_trace_kprobe(const char *group, const char *event, void *addr, const char *symbol, unsigned long offs, int nargs, bool is_return) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = -ENOMEM; - tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); - if (!tp) + tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + if (!tk) return ERR_PTR(ret); if (symbol) { - tp->symbol = kstrdup(symbol, GFP_KERNEL); - if (!tp->symbol) + tk->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tk->symbol) goto error; - tp->rp.kp.symbol_name = tp->symbol; - tp->rp.kp.offset = offs; + tk->rp.kp.symbol_name = tk->symbol; + tk->rp.kp.offset = offs; } else - tp->rp.kp.addr = addr; + tk->rp.kp.addr = addr; if (is_return) - tp->rp.handler = kretprobe_dispatcher; + tk->rp.handler = kretprobe_dispatcher; else - tp->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.kp.pre_handler = kprobe_dispatcher; if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } - tp->call.class = &tp->class; - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) + tk->tp.call.class = &tk->tp.class; + tk->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tk->tp.call.name) goto error; if (!group || !is_good_name(group)) { @@ -150,42 +313,42 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) + tk->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tk->tp.class.system) goto error; - INIT_LIST_HEAD(&tp->list); - INIT_LIST_HEAD(&tp->files); - return tp; + INIT_LIST_HEAD(&tk->list); + INIT_LIST_HEAD(&tk->tp.files); + return tk; error: - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); return ERR_PTR(ret); } -static void free_trace_probe(struct trace_probe *tp) +static void free_trace_kprobe(struct trace_kprobe *tk) { int i; - for (i = 0; i < tp->nr_args; i++) - traceprobe_free_probe_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_free_probe_arg(&tk->tp.args[i]); - kfree(tp->call.class->system); - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.class->system); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); } -static struct trace_probe *find_trace_probe(const char *event, - const char *group) +static struct trace_kprobe *find_trace_kprobe(const char *event, + const char *group) { - struct trace_probe *tp; + struct trace_kprobe *tk; - list_for_each_entry(tp, &probe_list, list) - if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.class->system, group) == 0) - return tp; + list_for_each_entry(tk, &probe_list, list) + if (strcmp(tk->tp.call.name, event) == 0 && + strcmp(tk->tp.call.class->system, group) == 0) + return tk; return NULL; } @@ -194,7 +357,7 @@ static struct trace_probe *find_trace_probe(const char *event, * if the file is NULL, enable "perf" handler, or enable "trace" handler. */ static int -enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) +enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { int ret = 0; @@ -208,17 +371,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } link->file = file; - list_add_tail_rcu(&link->list, &tp->files); + list_add_tail_rcu(&link->list, &tk->tp.files); - tp->flags |= TP_FLAG_TRACE; + tk->tp.flags |= TP_FLAG_TRACE; } else - tp->flags |= TP_FLAG_PROFILE; + tk->tp.flags |= TP_FLAG_PROFILE; - if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { - if (trace_probe_is_return(tp)) - ret = enable_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); else - ret = enable_kprobe(&tp->rp.kp); + ret = enable_kprobe(&tk->rp.kp); } out: return ret; @@ -241,14 +404,14 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ static int -disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) +disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { struct event_file_link *link = NULL; int wait = 0; int ret = 0; if (file) { - link = find_event_file_link(tp, file); + link = find_event_file_link(&tk->tp, file); if (!link) { ret = -EINVAL; goto out; @@ -256,18 +419,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) list_del_rcu(&link->list); wait = 1; - if (!list_empty(&tp->files)) + if (!list_empty(&tk->tp.files)) goto out; - tp->flags &= ~TP_FLAG_TRACE; + tk->tp.flags &= ~TP_FLAG_TRACE; } else - tp->flags &= ~TP_FLAG_PROFILE; + tk->tp.flags &= ~TP_FLAG_PROFILE; - if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); + if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); else - disable_kprobe(&tp->rp.kp); + disable_kprobe(&tk->rp.kp); wait = 1; } out: @@ -288,40 +451,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } /* Internal register function - just handle k*probes and flags */ -static int __register_trace_probe(struct trace_probe *tp) +static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; - if (trace_probe_is_registered(tp)) + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; - for (i = 0; i < tp->nr_args; i++) - traceprobe_update_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_update_arg(&tk->tp.args[i]); /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(tp)) - tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + if (trace_probe_is_enabled(&tk->tp)) + tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; else - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); + if (trace_kprobe_is_return(tk)) + ret = register_kretprobe(&tk->rp); else - ret = register_kprobe(&tp->rp.kp); + ret = register_kprobe(&tk->rp.kp); if (ret == 0) - tp->flags |= TP_FLAG_REGISTERED; + tk->tp.flags |= TP_FLAG_REGISTERED; else { pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_probe_symbol(tp), trace_probe_offset(tp), ret); - if (ret == -ENOENT && trace_probe_is_on_module(tp)) { + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warning("This probe might be able to register after" "target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { pr_warning("Probing address(0x%p) is not an " "instruction boundary.\n", - tp->rp.kp.addr); + tk->rp.kp.addr); ret = -EINVAL; } } @@ -330,67 +493,67 @@ static int __register_trace_probe(struct trace_probe *tp) } /* Internal unregister function - just handle k*probes and flags */ -static void __unregister_trace_probe(struct trace_probe *tp) +static void __unregister_trace_kprobe(struct trace_kprobe *tk) { - if (trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + unregister_kretprobe(&tk->rp); else - unregister_kprobe(&tp->rp.kp); - tp->flags &= ~TP_FLAG_REGISTERED; + unregister_kprobe(&tk->rp.kp); + tk->tp.flags &= ~TP_FLAG_REGISTERED; /* Cleanup kprobe for reuse */ - if (tp->rp.kp.symbol_name) - tp->rp.kp.addr = NULL; + if (tk->rp.kp.symbol_name) + tk->rp.kp.addr = NULL; } } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static int unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_kprobe(struct trace_kprobe *tk) { /* Enabled event can not be unregistered */ - if (trace_probe_is_enabled(tp)) + if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; /* Will fail if probe is being used by ftrace or perf */ - if (unregister_probe_event(tp)) + if (unregister_kprobe_event(tk)) return -EBUSY; - __unregister_trace_probe(tp); - list_del(&tp->list); + __unregister_trace_kprobe(tk); + list_del(&tk->list); return 0; } /* Register a trace_probe and probe_event */ -static int register_trace_probe(struct trace_probe *tp) +static int register_trace_kprobe(struct trace_kprobe *tk) { - struct trace_probe *old_tp; + struct trace_kprobe *old_tk; int ret; mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tp = find_trace_probe(tp->call.name, tp->call.class->system); - if (old_tp) { - ret = unregister_trace_probe(old_tp); + old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); + if (old_tk) { + ret = unregister_trace_kprobe(old_tk); if (ret < 0) goto end; - free_trace_probe(old_tp); + free_trace_kprobe(old_tk); } /* Register new event */ - ret = register_probe_event(tp); + ret = register_kprobe_event(tk); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } /* Register k*probe */ - ret = __register_trace_probe(tp); + ret = __register_trace_kprobe(tk); if (ret < 0) - unregister_probe_event(tp); + unregister_kprobe_event(tk); else - list_add_tail(&tp->list, &probe_list); + list_add_tail(&tk->list, &probe_list); end: mutex_unlock(&probe_lock); @@ -398,11 +561,11 @@ end: } /* Module notifier call back, checking event on the module */ -static int trace_probe_module_callback(struct notifier_block *nb, +static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; - struct trace_probe *tp; + struct trace_kprobe *tk; int ret; if (val != MODULE_STATE_COMING) @@ -410,15 +573,15 @@ static int trace_probe_module_callback(struct notifier_block *nb, /* Update probes on coming module */ mutex_lock(&probe_lock); - list_for_each_entry(tp, &probe_list, list) { - if (trace_probe_within_module(tp, mod)) { + list_for_each_entry(tk, &probe_list, list) { + if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ - __unregister_trace_probe(tp); - ret = __register_trace_probe(tp); + __unregister_trace_kprobe(tk); + ret = __register_trace_kprobe(tk); if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tp->call.name, mod->name, ret); + tk->tp.call.name, mod->name, ret); } } mutex_unlock(&probe_lock); @@ -426,12 +589,12 @@ static int trace_probe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; } -static struct notifier_block trace_probe_module_nb = { - .notifier_call = trace_probe_module_callback, +static struct notifier_block trace_kprobe_module_nb = { + .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; -static int create_trace_probe(int argc, char **argv) +static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: @@ -451,7 +614,7 @@ static int create_trace_probe(int argc, char **argv) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_probe *tp; + struct trace_kprobe *tk; int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; @@ -498,16 +661,16 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_trace_probe(event, group); - if (!tp) { + tk = find_trace_kprobe(event, group); + if (!tk) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ - ret = unregister_trace_probe(tp); + ret = unregister_trace_kprobe(tk); if (ret == 0) - free_trace_probe(tp); + free_trace_kprobe(tk); mutex_unlock(&probe_lock); return ret; } @@ -554,47 +717,49 @@ static int create_trace_probe(int argc, char **argv) is_return ? 'r' : 'p', addr); event = buf; } - tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, is_return); - if (IS_ERR(tp)) { + if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tp)); - return PTR_ERR(tp); + (int)PTR_ERR(tk)); + return PTR_ERR(tk); } /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + /* Increment count for freeing args in error case */ - tp->nr_args++; + tk->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tp->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tp->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tp->args[i].name)) { + if (!is_good_name(parg->name)) { pr_info("Invalid argument[%d] name: %s\n", - i, tp->args[i].name); + i, parg->name); ret = -EINVAL; goto error; } - if (traceprobe_conflict_field_name(tp->args[i].name, - tp->args, i)) { + if (traceprobe_conflict_field_name(parg->name, + tk->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -602,7 +767,7 @@ static int create_trace_probe(int argc, char **argv) } /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], + ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, is_return, true); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); @@ -610,35 +775,35 @@ static int create_trace_probe(int argc, char **argv) } } - ret = register_trace_probe(tp); + ret = register_trace_kprobe(tk); if (ret) goto error; return 0; error: - free_trace_probe(tp); + free_trace_kprobe(tk); return ret; } -static int release_all_trace_probes(void) +static int release_all_trace_kprobes(void) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = 0; mutex_lock(&probe_lock); /* Ensure no probe is in use. */ - list_for_each_entry(tp, &probe_list, list) - if (trace_probe_is_enabled(tp)) { + list_for_each_entry(tk, &probe_list, list) + if (trace_probe_is_enabled(&tk->tp)) { ret = -EBUSY; goto end; } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { - tp = list_entry(probe_list.next, struct trace_probe, list); - ret = unregister_trace_probe(tp); + tk = list_entry(probe_list.next, struct trace_kprobe, list); + ret = unregister_trace_kprobe(tk); if (ret) goto end; - free_trace_probe(tp); + free_trace_kprobe(tk); } end: @@ -666,22 +831,22 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); + seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); - if (!tp->symbol) - seq_printf(m, " 0x%p", tp->rp.kp.addr); - else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", trace_probe_symbol(tp), - tp->rp.kp.offset); + if (!tk->symbol) + seq_printf(m, " 0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), + tk->rp.kp.offset); else - seq_printf(m, " %s", trace_probe_symbol(tp)); + seq_printf(m, " %s", trace_kprobe_symbol(tk)); - for (i = 0; i < tp->nr_args; i++) - seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); + for (i = 0; i < tk->tp.nr_args; i++) + seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -699,7 +864,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_probes(); + ret = release_all_trace_kprobes(); if (ret < 0) return ret; } @@ -711,7 +876,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return traceprobe_probes_write(file, buffer, count, ppos, - create_trace_probe); + create_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -726,10 +891,10 @@ static const struct file_operations kprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, - tp->rp.kp.nmissed); + seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, + tk->rp.kp.nmissed); return 0; } @@ -754,57 +919,9 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; -/* Sum up total data length for dynamic arraies (strings) */ -static __kprobes int __get_data_size(struct trace_probe *tp, - struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, - struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - /* Kprobe handler */ static __kprobes void -__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { struct kprobe_trace_entry_head *entry; @@ -812,18 +929,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; WARN_ON(call != ftrace_file->event_call); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, call->event.type, @@ -832,26 +949,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, return; entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)tp->rp.kp.addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + entry->ip = (unsigned long)tk->rp.kp.addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); } static __kprobes void -kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tp->files, list) - __kprobe_trace_func(tp, regs, link->file); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kprobe_trace_func(tk, regs, link->file); } /* Kretprobe handler */ static __kprobes void -__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { @@ -860,18 +976,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; WARN_ON(call != ftrace_file->event_call); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, call->event.type, @@ -880,23 +996,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); } static __kprobes void -kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tp->files, list) - __kretprobe_trace_func(tp, ri, regs, link->file); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kretprobe_trace_func(tk, ri, regs, link->file); } /* Event entry printers */ @@ -983,16 +1098,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1004,17 +1121,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1022,74 +1141,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) -{ - int i; - int pos = 0; - - const char *fmt, *arg; - - if (!trace_probe_is_return(tp)) { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } else { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_print_fmt(struct trace_probe *tp) -{ - int len; - char *print_fmt; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1); - tp->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes void -kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1099,8 +1157,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) if (hlist_empty(head)) return; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1108,18 +1166,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) if (!entry) return; - entry->ip = (unsigned long)tp->rp.kp.addr; + entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } /* Kretprobe profile handler */ static __kprobes void -kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1129,8 +1187,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, if (hlist_empty(head)) return; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1138,9 +1196,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, if (!entry) return; - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ @@ -1155,20 +1213,20 @@ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { - struct trace_probe *tp = (struct trace_probe *)event->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_probe(tp, file); + return enable_trace_kprobe(tk, file); case TRACE_REG_UNREGISTER: - return disable_trace_probe(tp, file); + return disable_trace_kprobe(tk, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, NULL); + return enable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_UNREGISTER: - return disable_trace_probe(tp, NULL); + return disable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1182,15 +1240,15 @@ int kprobe_register(struct ftrace_event_call *event, static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tp->nhit++; + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(tp, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(tp, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kprobe_perf_func(tk, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1198,15 +1256,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) static __kprobes int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tp->nhit++; + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(tp, ri, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(tp, ri, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1219,21 +1277,21 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_probe_event(struct trace_probe *tp) +static int register_kprobe_event(struct trace_kprobe *tk) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; int ret; /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (trace_probe_is_return(tp)) { + if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } - if (set_print_fmt(tp) < 0) + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); if (!ret) { @@ -1242,7 +1300,7 @@ static int register_probe_event(struct trace_probe *tp) } call->flags = 0; call->class->reg = kprobe_register; - call->data = tp; + call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); @@ -1252,14 +1310,14 @@ static int register_probe_event(struct trace_probe *tp) return ret; } -static int unregister_probe_event(struct trace_probe *tp) +static int unregister_kprobe_event(struct trace_kprobe *tk) { int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tp->call); + ret = trace_remove_event_call(&tk->tp.call); if (!ret) - kfree(tp->call.print_fmt); + kfree(tk->tp.call.print_fmt); return ret; } @@ -1269,7 +1327,7 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; - if (register_module_notifier(&trace_probe_module_nb)) + if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; d_tracer = tracing_init_dentry(); @@ -1309,26 +1367,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, } static struct ftrace_event_file * -find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct ftrace_event_file *file; list_for_each_entry(file, &tr->events, list) - if (file->event_call == &tp->call) + if (file->event_call == &tk->tp.call) return file; return NULL; } /* - * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this * stage, we can do this lockless. */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; int (*target)(int, int, int, int, int, int); - struct trace_probe *tp; + struct trace_kprobe *tk; struct ftrace_event_file *file; target = kprobe_trace_selftest_target; @@ -1337,44 +1395,44 @@ static __init int kprobe_trace_self_tests_init(void) ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " "$stack $stack0 +0($stack)", - create_trace_probe); + create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting new probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_probe(tp, file); + enable_trace_kprobe(tk, file); } } ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_probe); + "$retval", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting 2nd new probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_probe(tp, file); + enable_trace_kprobe(tk, file); } } @@ -1384,46 +1442,46 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); /* Disable trace points before removing it */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_probe(tp, file); + disable_trace_kprobe(tk, file); } - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting 2nd test probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_probe(tp, file); + disable_trace_kprobe(tk, file); } - ret = traceprobe_command("-:testprobe", create_trace_probe); + ret = traceprobe_command("-:testprobe", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = traceprobe_command("-:testprobe2", create_trace_probe); + ret = traceprobe_command("-:testprobe2", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } end: - release_all_trace_probes(); + release_all_trace_kprobes(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b3dcfb2f0fef..0abd9b863474 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 34e7cbac0c9c..ed32284fbe32 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; - need_resched = - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; + default: + need_resched = '.'; + break; + } + hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 'h' : diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 412e959709b4..8364a421b4df 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -35,46 +35,27 @@ const char *reserved_field_names[] = { FIELD_STRING_FUNC, }; -/* Printing function type */ -#define PRINT_TYPE_FUNC_NAME(type) print_type_##type -#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type - /* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ +__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ const char *name, \ - void *data, void *ent)\ + void *data, void *ent) \ { \ - return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ + return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ } \ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -static inline void *get_rloc_data(u32 *dl) -{ - return (u8 *)dl + get_rloc_offs(*dl); -} +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) -{ - return (u8 *)ent + get_rloc_offs(*dl); -} - -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; +DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") /* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, +__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, void *data, void *ent) { @@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, (const char *)get_loc_data(data, ent)); } -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; #define CHECK_FETCH_FUNCS(method, fn) \ (((FETCH_FUNC_NAME(method, u8) == fn) || \ @@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ +__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ @@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg) #define fetch_reg_string NULL #define fetch_reg_string_size NULL -#define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - #define DEFINE_FETCH_retval(type) \ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ void *dummy, void *dest) \ { \ *(type *)dest = (type)regs_return_value(regs); \ @@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval) #define fetch_retval_string NULL #define fetch_retval_string_size NULL -#define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - long ret; - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); - } -} - -/* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} - -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -#define DEFINE_FETCH_symbol(type) \ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - /* Dereference memory access function */ struct deref_fetch_param { struct fetch_param orig; long offset; + fetch_func_t fetch; + fetch_func_t fetch_size; }; #define DEFINE_FETCH_deref(type) \ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ void *data, void *dest) \ { \ struct deref_fetch_param *dprm = data; \ @@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ call_fetch(&dprm->orig, regs, &addr); \ if (addr) { \ addr += dprm->offset; \ - fetch_memory_##type(regs, (void *)addr, dest); \ + dprm->fetch(regs, (void *)addr, dest); \ } else \ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) + +__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + struct deref_fetch_param *dprm = data; + unsigned long addr; + + call_fetch(&dprm->orig, regs, &addr); + if (addr && dprm->fetch_size) { + addr += dprm->offset; + dprm->fetch_size(regs, (void *)addr, dest); + } else + *(string_size *)dest = 0; +} static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) { @@ -329,7 +166,7 @@ struct bitfield_fetch_param { }; #define DEFINE_FETCH_bitfield(type) \ -static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ void *data, void *dest) \ { \ struct bitfield_fetch_param *bprm = data; \ @@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ - .size = _size, \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ - .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ - } \ - } - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 - -/* Fetch type information table */ -static const struct fetch_type fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) +static const struct fetch_type *find_fetch_type(const char *type, + const struct fetch_type *ftbl) { int i; @@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type) switch (bs) { case 8: - return find_fetch_type("u8"); + return find_fetch_type("u8", ftbl); case 16: - return find_fetch_type("u16"); + return find_fetch_type("u16", ftbl); case 32: - return find_fetch_type("u32"); + return find_fetch_type("u32", ftbl); case 64: - return find_fetch_type("u64"); + return find_fetch_type("u64", ftbl); default: goto fail; } } - for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) - if (strcmp(type, fetch_type_table[i].name) == 0) - return &fetch_type_table[i]; + for (i = 0; ftbl[i].name; i++) { + if (strcmp(type, ftbl[i].name) == 0) + return &ftbl[i]; + } fail: return NULL; } /* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, + void *dummy, void *dest) { *(unsigned long *)dest = kernel_stack_pointer(regs); } +static __kprobes void fetch_user_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = user_stack_pointer(regs); +} + static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn) + fetch_func_t orig_fn, + const struct fetch_type *ftbl) { int i; - if (type != &fetch_type_table[FETCH_TYPE_STRING]) + if (type != &ftbl[FETCH_TYPE_STRING]) return NULL; /* Only string type needs size function */ for (i = 0; i < FETCH_MTD_END; i++) if (type->fetch[i] == orig_fn) - return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; WARN_ON(1); /* This should not happen */ @@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return) + struct fetch_param *f, bool is_return, + bool is_kprobe) { int ret = 0; unsigned long param; @@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) - f->fn = fetch_stack_address; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) + return -EINVAL; + + if (is_kprobe) + f->fn = fetch_kernel_stack_address; else - ret = -EINVAL; + f->fn = fetch_user_stack_address; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || param > PARAM_MAX_STACK) + if (ret || (is_kprobe && param > PARAM_MAX_STACK)) ret = -EINVAL; else { f->fn = t->fetch[FETCH_MTD_stack]; @@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, static int parse_probe_arg(char *arg, const struct fetch_type *t, struct fetch_param *f, bool is_return, bool is_kprobe) { + const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; - int ret; - - ret = 0; + int ret = 0; - /* Until uprobe_events supports only reg arguments */ - if (!is_kprobe && arg[0] != '%') - return -EINVAL; + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); break; case '%': /* named register */ @@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, } break; - case '@': /* memory or symbol */ + case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) @@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; + } else if (arg[1] == '+') { + /* kprobes don't support file offsets */ + if (is_kprobe) + return -EINVAL; + + ret = kstrtol(arg + 2, 0, &offset); + if (ret) + break; + + f->fn = t->fetch[FETCH_MTD_file_offset]; + f->data = (void *)offset; } else { + /* uprobes don't support symbols */ + if (!is_kprobe) + return -EINVAL; + ret = traceprobe_split_symbol_offset(arg + 1, &offset); if (ret) break; @@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, struct deref_fetch_param *dprm; const struct fetch_type *t2; - t2 = find_fetch_type(NULL); + t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); @@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, return -ENOMEM; dprm->offset = offset; + dprm->fetch = t->fetch[FETCH_MTD_memory]; + dprm->fetch_size = get_fetch_size_function(t, + dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, is_kprobe); if (ret) @@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf, int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe) { + const struct fetch_type *ftbl; const char *t; int ret; + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } - parg->type = find_fetch_type(t); + parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; @@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn); + parg->fetch.fn, + ftbl); parg->fetch_size.data = parg->fetch.data; } @@ -837,3 +657,65 @@ out: return ret; } + +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, + bool is_return) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!is_return) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +int set_print_fmt(struct trace_probe *tp, bool is_return) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0, is_return); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1, is_return); + tp->call.print_fmt = print_fmt; + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5c7e09d10d74..b73574a5f429 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -81,6 +81,17 @@ */ #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ @@ -95,6 +106,7 @@ enum { FETCH_MTD_symbol, FETCH_MTD_deref, FETCH_MTD_bitfield, + FETCH_MTD_file_offset, FETCH_MTD_END, }; @@ -115,6 +127,148 @@ struct fetch_param { void *data; }; +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ +__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, \ + void *data, void *ent); \ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type + +/* Declare macro for basic types */ +#define DECLARE_FETCH_FUNC(method, type) \ +extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ + void *data, void *dest) + +#define DECLARE_BASIC_FETCH_FUNCS(method) \ +DECLARE_FETCH_FUNC(method, u8); \ +DECLARE_FETCH_FUNC(method, u16); \ +DECLARE_FETCH_FUNC(method, u32); \ +DECLARE_FETCH_FUNC(method, u64) + +DECLARE_BASIC_FETCH_FUNCS(reg); +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(retval); +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(symbol); +DECLARE_FETCH_FUNC(symbol, string); +DECLARE_FETCH_FUNC(symbol, string_size); + +DECLARE_BASIC_FETCH_FUNCS(deref); +DECLARE_FETCH_FUNC(deref, string); +DECLARE_FETCH_FUNC(deref, string_size); + +DECLARE_BASIC_FETCH_FUNCS(bitfield); +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ +ASSIGN_FETCH_FUNC(file_offset, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define ASSIGN_FETCH_TYPE_END {} + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* + * Fetch type information table. + * It's declared as a weak symbol due to conditional compilation. + */ +extern __weak const struct fetch_type kprobes_fetch_type_table[]; +extern __weak const struct fetch_type uprobes_fetch_type_table[]; + +#ifdef CONFIG_KPROBE_EVENT +struct symbol_cache; +unsigned long update_symbol_cache(struct symbol_cache *sc); +void free_symbol_cache(struct symbol_cache *sc); +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +#else +/* uprobes do not support symbol fetch methods */ +#define fetch_symbol_u8 NULL +#define fetch_symbol_u16 NULL +#define fetch_symbol_u32 NULL +#define fetch_symbol_u64 NULL +#define fetch_symbol_string NULL +#define fetch_symbol_string_size NULL + +struct symbol_cache { +}; +static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) +{ + return 0; +} + +static inline void __used free_symbol_cache(struct symbol_cache *sc) +{ +} + +static inline struct symbol_cache * __used +alloc_symbol_cache(const char *sym, long offset) +{ + return NULL; +} +#endif /* CONFIG_KPROBE_EVENT */ + struct probe_arg { struct fetch_param fetch; struct fetch_param fetch_size; @@ -124,6 +278,26 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct trace_probe { + unsigned int flags; /* For TP_FLAG_* */ + struct ftrace_event_class class; + struct ftrace_event_call call; + struct list_head files; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static inline bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + static inline __kprobes void call_fetch(struct fetch_param *fprm, struct pt_regs *regs, void *dest) { @@ -158,3 +332,53 @@ extern ssize_t traceprobe_probes_write(struct file *file, int (*createfn)(int, char**)); extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); + +/* Sum up total data length for dynamic arraies (strings) */ +static inline __kprobes int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static inline __kprobes void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + +extern int set_print_fmt(struct trace_probe *tp, bool is_return); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4e98e3b257a3..3f34dc9b40f3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } @@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..6e32635e5e57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -16,6 +16,7 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <trace/events/sched.h> #include "trace.h" @@ -27,6 +28,8 @@ static int wakeup_cpu; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; +static int wakeup_dl; +static int tracing_dl = 0; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -437,6 +440,7 @@ static void __wakeup_reset(struct trace_array *tr) { wakeup_cpu = -1; wakeup_prio = -1; + tracing_dl = 0; if (wakeup_task) put_task_struct(wakeup_task); @@ -472,9 +476,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if ((wakeup_rt && !rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= current->prio) + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; pc = preempt_count(); @@ -486,7 +498,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) arch_spin_lock(&wakeup_lock); /* check for races. */ - if (!tracer_enabled || p->prio >= wakeup_prio) + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) goto out_locked; /* reset the trace */ @@ -496,6 +509,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. + */ + if (dl_task(p)) + tracing_dl = 1; + else + tracing_dl = 0; + wakeup_task = p; get_task_struct(wakeup_task); @@ -597,16 +619,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); } static int wakeup_rt_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); } +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + wakeup_dl = 1; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; @@ -674,6 +705,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = .use_max_tr = true, }; +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = true, +}; + __init static int init_wakeup_tracer(void) { int ret; @@ -686,6 +739,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + return 0; } core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - /* Make this a RT thread, doesn't need to be too high */ - static const struct sched_param param = { .sched_priority = 5 }; + /* Make this a -deadline thread */ + static const struct sched_attr attr = { + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL + }; struct completion *x = data; - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_setattr(current, &attr); /* Make it know we have a new prio */ complete(x); @@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* - * This is an RT task, do short sleeps to let - * others run. + * This will likely be the system top priority + * task, do short sleeps to let others run. */ msleep(100); } @@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tracing_max_latency; struct task_struct *p; - struct completion isrt; + struct completion is_ready; unsigned long count; int ret; - init_completion(&isrt); + init_completion(&is_ready); - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) while (p->on_rq) { /* - * Sleep to make sure the RT thread is asleep too. + * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, * but we want to make sure this test still works. */ msleep(100); } - init_completion(&isrt); + init_completion(&is_ready); wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&isrt); + wait_for_completion(&is_ready); /* stop the tracing. */ tracing_stop(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b20428c5efe2..e6be585cf06a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -382,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 847f88a6194b..7af67360b330 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. - */ -static struct rb_node *release_next(struct tracer_stat *ts, - struct rb_node *node) +static void __reset_stat_session(struct stat_session *session) { - struct stat_node *snode; - struct rb_node *parent = rb_parent(node); - - if (node->rb_left) - return node->rb_left; - else if (node->rb_right) - return node->rb_right; - else { - if (!parent) - ; - else if (parent->rb_left == node) - parent->rb_left = NULL; - else - parent->rb_right = NULL; + struct stat_node *snode, *n; - snode = container_of(node, struct stat_node, node); - if (ts->stat_release) - ts->stat_release(snode->stat); + rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { + if (session->ts->stat_release) + session->ts->stat_release(snode->stat); kfree(snode); - - return parent; } -} - -static void __reset_stat_session(struct stat_session *session) -{ - struct rb_node *node = session->stat_root.rb_node; - - while (node) - node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 559329d9bd2f..759d5e004517 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ + ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -336,15 +343,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_current_check_discard(buffer, sys_data->enter_event, - entry, event)) - trace_current_buffer_unlock_commit(buffer, event, - irq_flags, pc); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -356,7 +362,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ + ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -377,10 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_current_check_discard(buffer, sys_data->exit_event, - entry, event)) - trace_current_buffer_unlock_commit(buffer, event, - irq_flags, pc); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } static int reg_event_syscall_enter(struct ftrace_event_file *file, @@ -397,7 +407,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - set_bit(num, tr->enabled_enter_syscalls); + rcu_assign_pointer(tr->enter_syscall_files[num], file); tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); @@ -415,7 +425,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - clear_bit(num, tr->enabled_enter_syscalls); + rcu_assign_pointer(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); @@ -435,7 +445,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, tr->enabled_exit_syscalls); + rcu_assign_pointer(tr->exit_syscall_files[num], file); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -453,7 +463,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - clear_bit(num, tr->enabled_exit_syscalls); + rcu_assign_pointer(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 272261b5f94f..79e52d93860b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -51,22 +51,17 @@ struct trace_uprobe_filter { */ struct trace_uprobe { struct list_head list; - struct ftrace_event_class class; - struct ftrace_event_call call; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; + struct trace_probe tp; }; -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, args) + \ +#define SIZEOF_TRACE_UPROBE(n) \ + (offsetof(struct trace_uprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) static int register_uprobe_event(struct trace_uprobe *tu); @@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu); static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; + static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs); +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long ret; + unsigned long addr = user_stack_pointer(regs); + + addr = adjust_stack_addr(addr, n); + + if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) + return 0; + + return ret; +} + +/* + * Uprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)get_user_stack_nth(regs, \ + ((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + void __user *vaddr = (void __force __user *) addr; \ + \ + if (copy_from_user(&retval, vaddr, sizeof(type))) \ + *(type *)dest = 0; \ + else \ + *(type *) dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + u32 rloc = *(u32 *)dest; + int maxlen = get_rloc_len(rloc); + u8 *dst = get_rloc_data(dest); + void __user *src = (void __force __user *) addr; + + if (!maxlen) + return; + + ret = strncpy_from_user(dst, src, maxlen); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); + } else { + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + } +} + +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int len; + void __user *vaddr = (void __force __user *) addr; + + len = strnlen_user(vaddr, MAX_STRING_SIZE); + + if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +static unsigned long translate_user_vaddr(void *file_offset) +{ + unsigned long base_addr; + struct uprobe_dispatch_data *udd; + + udd = (void *) current->utask->vaddr; + + base_addr = udd->bp_addr - udd->tu->offset; + return base_addr + (unsigned long)file_offset; +} + +#define DEFINE_FETCH_file_offset(type) \ +static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + void *vaddr = (void *)translate_user_vaddr(offset); \ + \ + FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +} +DEFINE_BASIC_FETCH_FUNCS(file_offset) +DEFINE_FETCH_file_offset(string) +DEFINE_FETCH_file_offset(string_size) + +/* Fetch type information table */ +const struct fetch_type uprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); @@ -114,13 +250,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - tu->call.class = &tu->class; - tu->call.name = kstrdup(event, GFP_KERNEL); - if (!tu->call.name) + tu->tp.call.class = &tu->tp.class; + tu->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tu->tp.call.name) goto error; - tu->class.system = kstrdup(group, GFP_KERNEL); - if (!tu->class.system) + tu->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tu->tp.class.system) goto error; INIT_LIST_HEAD(&tu->list); @@ -128,10 +264,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); + tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: - kfree(tu->call.name); + kfree(tu->tp.call.name); kfree(tu); return ERR_PTR(-ENOMEM); @@ -141,12 +278,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu) { int i; - for (i = 0; i < tu->nr_args; i++) - traceprobe_free_probe_arg(&tu->args[i]); + for (i = 0; i < tu->tp.nr_args; i++) + traceprobe_free_probe_arg(&tu->tp.args[i]); iput(tu->inode); - kfree(tu->call.class->system); - kfree(tu->call.name); + kfree(tu->tp.call.class->system); + kfree(tu->tp.call.name); kfree(tu->filename); kfree(tu); } @@ -156,8 +293,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(tu->call.name, event) == 0 && - strcmp(tu->call.class->system, group) == 0) + if (strcmp(tu->tp.call.name, event) == 0 && + strcmp(tu->tp.call.class->system, group) == 0) return tu; return NULL; @@ -180,16 +317,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { - struct trace_uprobe *old_tp; + struct trace_uprobe *old_tu; int ret; mutex_lock(&uprobe_lock); /* register as an event */ - old_tp = find_probe_event(tu->call.name, tu->call.class->system); - if (old_tp) { + old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); + if (old_tu) { /* delete old event */ - ret = unregister_trace_uprobe(old_tp); + ret = unregister_trace_uprobe(old_tu); if (ret) goto end; } @@ -210,7 +347,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] * * - Remove uprobe: -:[GRP/]EVENT */ @@ -359,34 +496,36 @@ static int create_trace_uprobe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + /* Increment count for freeing args in error case */ - tu->nr_args++; + tu->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tu->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tu->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tu->args[i].name)) { - pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", i, parg->name); ret = -EINVAL; goto error; } - if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { + if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -394,7 +533,8 @@ static int create_trace_uprobe(int argc, char **argv) } /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); + ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + is_return, false); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -458,11 +598,11 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); - for (i = 0; i < tu->nr_args; i++) - seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); + for (i = 0; i < tu->tp.nr_args; i++) + seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -508,7 +648,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; - seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); return 0; } @@ -532,21 +672,117 @@ static const struct file_operations uprobe_profile_ops = { .release = seq_release, }; +struct uprobe_cpu_buffer { + struct mutex mutex; + void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ + int cpu, err_cpu; + + uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); + if (uprobe_cpu_buffer == NULL) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct page *p = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL, 0); + if (p == NULL) { + err_cpu = cpu; + goto err; + } + per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); + mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); + } + + return 0; + +err: + for_each_possible_cpu(cpu) { + if (cpu == err_cpu) + break; + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); + } + + free_percpu(uprobe_cpu_buffer); + return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ + int ret = 0; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (uprobe_buffer_refcnt++ == 0) { + ret = uprobe_buffer_init(); + if (ret < 0) + uprobe_buffer_refcnt--; + } + + return ret; +} + +static void uprobe_buffer_disable(void) +{ + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (--uprobe_buffer_refcnt == 0) { + free_percpu(uprobe_cpu_buffer); + uprobe_cpu_buffer = NULL; + } +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ + struct uprobe_cpu_buffer *ucb; + int cpu; + + cpu = raw_smp_processor_id(); + ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + + /* + * Use per-cpu buffers for fastest access, but we might migrate + * so the mutex makes sure we have sole access to it. + */ + mutex_lock(&ucb->mutex); + + return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ + mutex_unlock(&ucb->mutex); +} + static void uprobe_trace_print(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + struct uprobe_cpu_buffer *ucb; void *data; - int size, i; - struct ftrace_event_call *call = &tu->call; + int size, dsize, esize; + struct ftrace_event_call *call = &tu->tp.call; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) + return; + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + size = esize + tu->tp.size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size + tu->size, 0, 0); + size, 0, 0); if (!event) - return; + goto out; entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { @@ -558,11 +794,13 @@ static void uprobe_trace_print(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) - call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + memcpy(data, ucb->buf, tu->tp.size + dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); + +out: + uprobe_buffer_put(ucb); } /* uprobe handler */ @@ -590,23 +828,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e int i; entry = (struct uprobe_trace_entry_head *)iter->ent; - tu = container_of(event, struct trace_uprobe, call.event); + tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, entry->vaddr[1], entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) { - if (!tu->args[i].type->print(s, tu->args[i].name, - data + tu->args[i].offset, entry)) + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + if (!parg->type->print(s, parg->name, data + parg->offset, entry)) goto partial; } @@ -617,11 +856,6 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) -{ - return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); -} - typedef bool (*filter_func_t)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); @@ -631,29 +865,35 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; - if (is_trace_uprobe_enabled(tu)) + if (trace_probe_is_enabled(&tu->tp)) return -EINTR; + ret = uprobe_buffer_enable(); + if (ret < 0) + return ret; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - tu->flags |= flag; + tu->tp.flags |= flag; tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) - tu->flags &= ~flag; + tu->tp.flags &= ~flag; return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!is_trace_uprobe_enabled(tu)) + if (!trace_probe_is_enabled(&tu->tp)) return; WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->flags &= ~flag; + tu->tp.flags &= ~flag; + + uprobe_buffer_disable(); } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -671,12 +911,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) size = SIZEOF_TRACE_ENTRY(false); } /* Set argument names as fields */ - for (i = 0; i < tu->nr_args; i++) { - ret = trace_define_field(event_call, tu->args[i].type->fmttype, - tu->args[i].name, - size + tu->args[i].offset, - tu->args[i].type->size, - tu->args[i].type->is_signed, + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, size + parg->offset, + parg->type->size, parg->type->is_signed, FILTER_OTHER); if (ret) @@ -685,59 +925,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -#define LEN_OR_ZERO (len ? len - pos : 0) -static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) -{ - const char *fmt, *arg; - int i; - int pos = 0; - - if (is_ret_probe(tu)) { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } else { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } - - /* When len=0, we just calculate the needed length */ - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tu->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tu->args[i].name, tu->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tu->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tu->args[i].name); - } - - return pos; /* return the length of print_fmt */ -} -#undef LEN_OR_ZERO - -static int set_print_fmt(struct trace_uprobe *tu) -{ - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tu, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tu, print_fmt, len + 1); - tu->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS static bool __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) @@ -830,14 +1017,27 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void uprobe_perf_print(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { - struct ftrace_event_call *call = &tu->call; + struct ftrace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; + struct uprobe_cpu_buffer *ucb; void *data; - int size, rctx, i; + int size, dsize, esize; + int rctx; - size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return; + + size = esize + tu->tp.size + dsize; + size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); preempt_disable(); head = this_cpu_ptr(call->perf_events); @@ -857,12 +1057,18 @@ static void uprobe_perf_print(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) - call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + memcpy(data, ucb->buf, tu->tp.size + dsize); + + if (size - esize > tu->tp.size + dsize) { + int len = tu->tp.size + dsize; + + memset(data + len, 0, size - esize - len); + } perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); + uprobe_buffer_put(ucb); } /* uprobe profile handler */ @@ -920,16 +1126,22 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; - if (tu->flags & TP_FLAG_TRACE) + udd.tu = tu; + udd.bp_addr = instruction_pointer(regs); + + current->utask->vaddr = (unsigned long) &udd; + + if (tu->tp.flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS - if (tu->flags & TP_FLAG_PROFILE) + if (tu->tp.flags & TP_FLAG_PROFILE) ret |= uprobe_perf_func(tu, regs); #endif return ret; @@ -939,14 +1151,20 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) { struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; tu = container_of(con, struct trace_uprobe, consumer); - if (tu->flags & TP_FLAG_TRACE) + udd.tu = tu; + udd.bp_addr = func; + + current->utask->vaddr = (unsigned long) &udd; + + if (tu->tp.flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs); #ifdef CONFIG_PERF_EVENTS - if (tu->flags & TP_FLAG_PROFILE) + if (tu->tp.flags & TP_FLAG_PROFILE) uretprobe_perf_func(tu, func, regs); #endif return 0; @@ -958,7 +1176,7 @@ static struct trace_event_functions uprobe_funcs = { static int register_uprobe_event(struct trace_uprobe *tu) { - struct ftrace_event_call *call = &tu->call; + struct ftrace_event_call *call = &tu->tp.call; int ret; /* Initialize ftrace_event_call */ @@ -966,7 +1184,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; - if (set_print_fmt(tu) < 0) + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); @@ -993,11 +1211,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->call); + ret = trace_remove_event_call(&tu->tp.call); if (ret) return ret; - kfree(tu->call.print_fmt); - tu->call.print_fmt = NULL; + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; return 0; } diff --git a/kernel/up.c b/kernel/up.c index 630d72bf7e41..509403e3fbc6 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +void __smp_call_function_single(int cpu, struct call_single_data *csd, + int wait) +{ + unsigned long flags; + + local_irq_save(flags); + csd->func(csd->info); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__smp_call_function_single); + int on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/user.c b/kernel/user.c index 5bbb91988e69..c006131beb77 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,10 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, +#ifdef CONFIG_PERSISTENT_KEYRINGS + .persistent_keyring_register_sem = + __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb1134ba58..240fb62cf394 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -101,6 +101,9 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); +#ifdef CONFIG_PERSISTENT_KEYRINGS + init_rwsem(&ns->persistent_keyring_register_sem); +#endif return 0; } @@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns) do { parent = ns->parent; +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); ns = parent; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03ebc..82ef9f3b7473 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +/* I: attributes used when instantiating ordered pools on demand */ +static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* allocate ID and assign it to @pool */ +/** + * worker_pool_assign_id - allocate ID and assing it to @pool + * @pool: the pool pointer of interest + * + * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned + * successfully, -errno on failure. + */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); - ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, + GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; @@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); - /* if dying, only works from the same workqueue are allowed */ + /* if draining, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; @@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; + set_user_nice(worker->task, pool->attrs->nice); + + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; + /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. */ - set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; - /* * The caller is responsible for ensuring %POOL_DISASSOCIATED * remains stable across this function. See the comments above the @@ -2840,19 +2851,6 @@ already_gone: return false; } -static bool __flush_work(struct work_struct *work) -{ - struct wq_barrier barr; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } -} - /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2866,10 +2864,18 @@ static bool __flush_work(struct work_struct *work) */ bool flush_work(struct work_struct *work) { + struct wq_barrier barr; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - return __flush_work(work); + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } } EXPORT_SYMBOL_GPL(flush_work); @@ -4106,7 +4112,7 @@ out_unlock: static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; - int cpu; + int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4126,6 +4132,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) mutex_unlock(&wq->mutex); } return 0; + } else if (wq->flags & __WQ_ORDERED) { + ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + return ret; } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -4776,6 +4789,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb, /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); break; } return NOTIFY_OK; @@ -4814,14 +4828,8 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); - - /* - * The work item is on-stack and can't lead to deadlock through - * flushing. Use __flush_work() to avoid spurious lockdep warnings - * when work_on_cpu()s are nested. - */ - __flush_work(&wfc.work); - + flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); @@ -5009,10 +5017,6 @@ static int __init init_workqueues(void) int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; - /* make sure we have enough bits for OFFQ pool ID */ - BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < - WORK_CPU_END * NR_STD_WORKER_POOLS); - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -5051,13 +5055,23 @@ static int __init init_workqueues(void) } } - /* create default unbound wq attrs */ + /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; + + /* + * An ordered wq should have only one pwq as ordering is + * guaranteed by max_active which is enforced by pwqs. + * Turn off NUMA so that dfl_pwq is used for all nodes. + */ + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + attrs->nice = std_nice[i]; + attrs->no_numa = true; + ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); |