diff options
author | Ingo Molnar <mingo@elte.hu> | 2010-08-19 12:48:09 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-08-19 12:48:09 +0200 |
commit | c8710ad38900153af7a3e6762e99c062cfa46443 (patch) | |
tree | a0c0632274c4eb72f51e99a5861f71cffe65ea60 /kernel | |
parent | 6016ee13db518ab1cd0cbf43fc2ad5712021e338 (diff) | |
parent | 86397dc3ccfc0e17b7550d05eaf15fe91f6498dd (diff) |
Merge branch 'tip/perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/core
Diffstat (limited to 'kernel')
87 files changed, 7390 insertions, 4228 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ce53fb2bd1d9..0b72d1a74be0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,10 +70,11 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o @@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/acct.c b/kernel/acct.c index 385b88461c29..fa7eb3de2ddc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(file->f_path.dentry, &sbuf)) + if (vfs_statfs(&file->f_path, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; diff --git a/kernel/async.c b/kernel/async.c index 15319d6c18fe..cd9dbb913c77 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel. */ #include <linux/async.h> -#include <linux/bug.h> #include <linux/module.h> #include <linux/wait.h> #include <linux/sched.h> -#include <linux/init.h> -#include <linux/kthread.h> -#include <linux/delay.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; -#define MAX_THREADS 256 #define MAX_WORK 32768 static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); -static int async_enabled = 0; - struct async_entry { - struct list_head list; - async_cookie_t cookie; - async_func_ptr *func; - void *data; - struct list_head *running; + struct list_head list; + struct work_struct work; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); -static DECLARE_WAIT_QUEUE_HEAD(async_new); static atomic_t entry_count; -static atomic_t thread_count; extern int initcall_debug; @@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) spin_unlock_irqrestore(&async_lock, flags); return ret; } + /* * pick the first pending entry and run it */ -static void run_one_entry(void) +static void async_run_entry_fn(struct work_struct *work) { + struct async_entry *entry = + container_of(work, struct async_entry, work); unsigned long flags; - struct async_entry *entry; ktime_t calltime, delta, rettime; - /* 1) pick one task from the pending queue */ - + /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - if (list_empty(&async_pending)) - goto out; - entry = list_first_entry(&async_pending, struct async_entry, list); - - /* 2) move it to the running queue */ list_move_tail(&entry->list, entry->running); spin_unlock_irqrestore(&async_lock, flags); - /* 3) run it (and print duration)*/ + /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); @@ -153,31 +142,25 @@ static void run_one_entry(void) (long long)ktime_to_ns(delta) >> 10); } - /* 4) remove it from the running queue */ + /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - /* 5) free the entry */ + /* 4) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 6) wake up any waiters. */ + /* 5) wake up any waiters */ wake_up(&async_done); - return; - -out: - spin_unlock_irqrestore(&async_lock, flags); } - static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; - /* allow irq-off callers */ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); @@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ - if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l ptr(data, newcookie); return newcookie; } + INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; entry->running = running; @@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l list_add_tail(&entry->list, &async_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - wake_up(&async_new); + + /* schedule for execution */ + queue_work(system_unbound_wq, &entry->work); + return newcookie; } @@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); - - -static int async_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int ret = HZ; - set_current_state(TASK_INTERRUPTIBLE); - /* - * check the list head without lock.. false positives - * are dealt with inside run_one_entry() while holding - * the lock. - */ - rmb(); - if (!list_empty(&async_pending)) - run_one_entry(); - else - ret = schedule_timeout(HZ); - - if (ret == 0) { - /* - * we timed out, this means we as thread are redundant. - * we sign off and die, but we to avoid any races there - * is a last-straw check to see if work snuck in. - */ - atomic_dec(&thread_count); - wmb(); /* manager must see our departure first */ - if (list_empty(&async_pending)) - break; - /* - * woops work came in between us timing out and us - * signing off; we need to stay alive and keep working. - */ - atomic_inc(&thread_count); - } - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int async_manager_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int tc, ec; - - set_current_state(TASK_INTERRUPTIBLE); - - tc = atomic_read(&thread_count); - rmb(); - ec = atomic_read(&entry_count); - - while (tc < ec && tc < MAX_THREADS) { - if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", - tc))) { - msleep(100); - continue; - } - atomic_inc(&thread_count); - tc++; - } - - schedule(); - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int __init async_init(void) -{ - async_enabled = - !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); - - WARN_ON(!async_enabled); - return 0; -} - -core_initcall(async_init); diff --git a/kernel/audit.c b/kernel/audit.c index c71bd26631a2..d96045789b54 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,7 +56,6 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> -#include <linux/inotify.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/audit.h b/kernel/audit.h index 208687be4f30..f7206db4e13d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); + /* audit watch functions */ -extern unsigned long audit_watch_inode(struct audit_watch *watch); -extern dev_t audit_watch_dev(struct audit_watch *watch); +#ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); -extern int audit_add_watch(struct audit_krule *krule); -extern void audit_remove_watch(struct audit_watch *watch); -extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); -extern void audit_inotify_unregister(struct list_head *in_list); +extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); +extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern struct list_head *audit_watch_rules(struct audit_watch *watch); - -extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +#else +#define audit_put_watch(w) {} +#define audit_get_watch(w) {} +#define audit_to_watch(k, p, l, o) (-EINVAL) +#define audit_add_watch(k, l) (-EINVAL) +#define audit_remove_watch_rule(k) BUG() +#define audit_watch_path(w) "" +#define audit_watch_compare(w, i, d) 0 + +#endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 46a57b57a335..7f18d3a4527e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1,5 +1,5 @@ #include "audit.h" -#include <linux/inotify.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> @@ -22,7 +22,7 @@ struct audit_tree { struct audit_chunk { struct list_head hash; - struct inotify_watch watch; + struct fsnotify_mark mark; struct list_head trees; /* with root here */ int dead; int count; @@ -59,7 +59,7 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. @@ -68,7 +68,7 @@ static LIST_HEAD(prune_list); * that makes a difference. Some. */ -static struct inotify_handle *rtree_ih; +static struct fsnotify_group *audit_tree_group; static struct audit_tree *alloc_tree(const char *s) { @@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree) return tree->pathname; } -static struct audit_chunk *alloc_chunk(int count) -{ - struct audit_chunk *chunk; - size_t size; - int i; - - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); - if (!chunk) - return NULL; - - INIT_LIST_HEAD(&chunk->hash); - INIT_LIST_HEAD(&chunk->trees); - chunk->count = count; - atomic_long_set(&chunk->refs, 1); - for (i = 0; i < count; i++) { - INIT_LIST_HEAD(&chunk->owners[i].list); - chunk->owners[i].index = i; - } - inotify_init_watch(&chunk->watch); - return chunk; -} - static void free_chunk(struct audit_chunk *chunk) { int i; @@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu) audit_put_chunk(chunk); } +static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +{ + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + call_rcu(&chunk->head, __put_chunk); +} + +static struct audit_chunk *alloc_chunk(int count) +{ + struct audit_chunk *chunk; + size_t size; + int i; + + size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); + chunk = kzalloc(size, GFP_KERNEL); + if (!chunk) + return NULL; + + INIT_LIST_HEAD(&chunk->hash); + INIT_LIST_HEAD(&chunk->trees); + chunk->count = count; + atomic_long_set(&chunk->refs, 1); + for (i = 0; i < count; i++) { + INIT_LIST_HEAD(&chunk->owners[i].list); + chunk->owners[i].index = i; + } + fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + return chunk; +} + enum {HASH_SIZE = 128}; static struct list_head chunk_hash_heads[HASH_SIZE]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); @@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode) return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock is held by caller */ +/* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - struct list_head *list = chunk_hash(chunk->watch.inode); + struct fsnotify_mark *entry = &chunk->mark; + struct list_head *list; + + if (!entry->i.inode) + return; + list = chunk_hash(entry->i.inode); list_add_rcu(&chunk->hash, list); } @@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (p->watch.inode == inode) { + /* mark.inode may have gone NULL, but who cares? */ + if (p->mark.i.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p) static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); + struct fsnotify_mark *entry = &chunk->mark; struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; - if (!pin_inotify_watch(&chunk->watch)) { - /* - * Filesystem is shutting down; all watches are getting - * evicted, just take it off the node list for this - * tree and let the eviction logics take care of the - * rest. - */ - owner = p->owner; - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - return; - } + fsnotify_get_mark(entry); spin_unlock(&hash_lock); - /* - * pin_inotify_watch() succeeded, so the watch won't go away - * from under us. - */ - mutex_lock(&chunk->watch.inode->inotify_mutex); - if (chunk->dead) { - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_lock(&entry->lock); + if (chunk->dead || !entry->i.inode) { + spin_unlock(&entry->lock); goto out; } @@ -256,16 +249,17 @@ static void untag_chunk(struct node *p) list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; } new = alloc_chunk(size); if (!new) goto Fallback; - if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { + fsnotify_duplicate_mark(&new->mark, entry); + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); goto Fallback; } @@ -298,9 +292,9 @@ static void untag_chunk(struct node *p) list_for_each_entry(owner, &new->trees, same_root) owner->root = new; spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; Fallback: @@ -314,31 +308,33 @@ Fallback: p->owner = NULL; put_tree(owner); spin_unlock(&hash_lock); - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_unlock(&entry->lock); out: - unpin_inotify_watch(&chunk->watch); + fsnotify_put_mark(entry); spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) { + struct fsnotify_mark *entry; struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) return -ENOMEM; - if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { + entry = &chunk->mark; + if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { free_chunk(chunk); return -ENOSPC; } - mutex_lock(&inode->inotify_mutex); + spin_lock(&entry->lock); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); return 0; } chunk->owners[0].index = (1U << 31); @@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) } insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&inode->inotify_mutex); + spin_unlock(&entry->lock); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct inotify_watch *watch; + struct fsnotify_mark *old_entry, *chunk_entry; struct audit_tree *owner; struct audit_chunk *chunk, *old; struct node *p; int n; - if (inotify_find_watch(rtree_ih, inode, &watch) < 0) + old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + if (!old_entry) return create_chunk(inode, tree); - old = container_of(watch, struct audit_chunk, watch); + old = container_of(old_entry, struct audit_chunk, mark); /* are we already there? */ spin_lock(&hash_lock); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return 0; } } @@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return -ENOMEM; } - mutex_lock(&inode->inotify_mutex); - if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + chunk_entry = &chunk->mark; + + spin_lock(&old_entry->lock); + if (!old_entry->i.inode) { + /* old_entry is being shot, lets just lie */ + spin_unlock(&old_entry->lock); + fsnotify_put_mark(old_entry); free_chunk(chunk); + return -ENOENT; + } + + fsnotify_duplicate_mark(chunk_entry, old_entry); + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + spin_unlock(&old_entry->lock); + free_chunk(chunk); + fsnotify_put_mark(old_entry); return -ENOSPC; } + + /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ + spin_lock(&chunk_entry->lock); spin_lock(&hash_lock); + + /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); - put_inotify_watch(&chunk->watch); + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + + fsnotify_destroy_mark(chunk_entry); + + fsnotify_put_mark(chunk_entry); + fsnotify_put_mark(old_entry); return 0; } list_replace_init(&old->trees, &chunk->trees); @@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); - inotify_evict_watch(&old->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ - put_inotify_watch(&old->watch); /* and kill it */ + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -584,7 +601,9 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { - struct inode *inode = find_chunk(node)->watch.inode; + struct audit_chunk *chunk = find_chunk(node); + /* this could be NULL if the watch is dieing else where... */ + struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); @@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list) * Here comes the stuff asynchronous to auditctl operations */ -/* inode->inotify_mutex is locked */ static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; @@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk) mutex_unlock(&audit_filter_mutex); } -static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static int audit_tree_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmonut_mark, + struct fsnotify_event *event) +{ + BUG(); + return -EOPNOTSUPP; +} + +static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); - if (mask & IN_IGNORED) { - evict_chunk(chunk); - put_inotify_watch(watch); - } + evict_chunk(chunk); + fsnotify_put_mark(entry); } -static void destroy_watch(struct inotify_watch *watch) +static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - call_rcu(&chunk->head, __put_chunk); + return false; } -static const struct inotify_operations rtree_inotify_ops = { - .handle_event = handle_event, - .destroy_watch = destroy_watch, +static const struct fsnotify_ops audit_tree_ops = { + .handle_event = audit_tree_handle_event, + .should_send_event = audit_tree_send_event, + .free_group_priv = NULL, + .free_event_priv = NULL, + .freeing_mark = audit_tree_freeing_mark, }; static int __init audit_tree_init(void) { int i; - rtree_ih = inotify_init(&rtree_inotify_ops); - if (IS_ERR(rtree_ih)) - audit_panic("cannot initialize inotify handle for rectree watches"); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + if (IS_ERR(audit_tree_group)) + audit_panic("cannot initialize fsnotify group for rectree watches"); for (i = 0; i < HASH_SIZE; i++) INIT_LIST_HEAD(&chunk_hash_heads[i]); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 8df43696f4ba..f0c9b2e7542d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -24,18 +24,18 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/fs.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/inotify.h> #include <linux/security.h> #include "audit.h" /* * Reference counting: * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED * event. Each audit_watch holds a reference to its associated parent. * * audit_watch: if added to lists, lifetime is from audit_init_watch() to @@ -51,40 +51,61 @@ struct audit_watch { unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ + struct list_head rules; /* anchor for krule->rlist */ }; struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ + struct list_head watches; /* anchor for audit_watch->wlist */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ }; -/* Inotify handle. */ -struct inotify_handle *audit_ih; +/* fsnotify handle. */ +struct fsnotify_group *audit_watch_group; -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 +/* fsnotify events we care about. */ +#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF +static void audit_free_parent(struct audit_parent *parent) +{ + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} -static void audit_free_parent(struct inotify_watch *i_watch) +static void audit_watch_free_mark(struct fsnotify_mark *entry) { struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); + parent = container_of(entry, struct audit_parent, mark); + audit_free_parent(parent); +} + +static void audit_get_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_get_mark(&parent->mark); +} + +static void audit_put_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_put_mark(&parent->mark); +} + +/* + * Find and return the audit_parent on the given inode. If found a reference + * is taken on this parent. + */ +static inline struct audit_parent *audit_find_parent(struct inode *inode) +{ + struct audit_parent *parent = NULL; + struct fsnotify_mark *entry; + + entry = fsnotify_find_inode_mark(audit_watch_group, inode); + if (entry) + parent = container_of(entry, struct audit_parent, mark); + + return parent; } void audit_get_watch(struct audit_watch *watch) @@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch) void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); + audit_put_parent(watch->parent); watch->parent = NULL; audit_put_watch(watch); /* match initial get */ } @@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -struct list_head *audit_watch_rules(struct audit_watch *watch) -{ - return &watch->rules; -} - -unsigned long audit_watch_inode(struct audit_watch *watch) +int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return watch->ino; -} - -dev_t audit_watch_dev(struct audit_watch *watch) -{ - return watch->dev; + return (watch->ino != (unsigned long)-1) && + (watch->ino == ino) && + (watch->dev == dev); } /* Initialize a parent watch entry. */ static struct audit_parent *audit_init_parent(struct nameidata *ndp) { + struct inode *inode = ndp->path.dentry->d_inode; struct audit_parent *parent; - s32 wd; + int ret; parent = kzalloc(sizeof(*parent), GFP_KERNEL); if (unlikely(!parent)) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); + + fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + parent->mark.mask = AUDIT_FS_WATCH; + ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + if (ret < 0) { + audit_free_parent(parent); + return ERR_PTR(ret); } return parent; @@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; - if (!audit_ih) + if (!audit_watch_group) return -EOPNOTSUPP; if (path[0] != '/' || path[len-1] == '/' || @@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) new->dev = old->dev; new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); + audit_get_parent(old->parent); new->parent = old->parent; out: @@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_entry *oentry, *nentry; mutex_lock(&audit_filter_mutex); + /* Run all of the watches on this parent looking for the one that + * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { if (audit_compare_dname_path(dname, owatch->path, NULL)) continue; /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context) + if (invalidating && !audit_dummy_context()) audit_filter_inodes(current, current->audit_context); + /* updating ino will likely change which audit_hash_list we + * are on so we need a new watch for the new list */ nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { mutex_unlock(&audit_filter_mutex); @@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.rlist); list_del_rcu(&oentry->list); - nentry = audit_dupe_rule(&oentry->rule, nwatch); + nentry = audit_dupe_rule(&oentry->rule); if (IS_ERR(nentry)) { list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { int h = audit_hash_ino((u32)ino); + + /* + * nentry->rule.watch == oentry->rule.watch so + * we must drop that reference and set it to our + * new watch. + */ + audit_put_watch(nentry->rule.watch); + audit_get_watch(nwatch); + nentry->rule.watch = nwatch; list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); list_replace(&oentry->rule.list, @@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_entry *e; mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); @@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_remove_watch(w); } mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } + fsnotify_destroy_mark(&parent->mark); } /* Get path information necessary for adding watches. */ @@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) } } -/* Associate the given rule with an existing parent inotify_watch. +/* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, struct audit_parent *parent) @@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule, struct audit_watch *w, *watch = krule->watch; int watch_found = 0; + BUG_ON(!mutex_is_locked(&audit_filter_mutex)); + list_for_each_entry(w, &parent->watches, wlist) { if (strcmp(watch->path, w->path)) continue; @@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule, } if (!watch_found) { - get_inotify_watch(&parent->wdata); + audit_get_parent(parent); watch->parent = parent; list_add(&watch->wlist, &parent->watches); @@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule, /* Find a matching watch entry, or add this one. * Caller must hold audit_filter_mutex. */ -int audit_add_watch(struct audit_krule *krule) +int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; struct audit_parent *parent; struct nameidata *ndp = NULL, *ndw = NULL; - int ret = 0; + int h, ret = 0; mutex_unlock(&audit_filter_mutex); @@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule) goto error; } + mutex_lock(&audit_filter_mutex); + /* update watch filter fields */ if (ndw) { watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; watch->ino = ndw->path.dentry->d_inode->i_ino; } - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { + /* either find an old parent or attach a new one */ + parent = audit_find_parent(ndp->path.dentry->d_inode); + if (!parent) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); ret = PTR_ERR(parent); goto error; } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); + } - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); + audit_add_to_parent(krule, parent); - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); + /* match get in audit_find_parent or audit_init_parent */ + audit_put_parent(parent); + h = audit_hash_ino((u32)watch->ino); + *list = &audit_inode_hash[h]; error: audit_put_nd(ndp, ndw); /* NULL args OK */ return ret; } -void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +void audit_remove_watch_rule(struct audit_krule *krule) { struct audit_watch *watch = krule->watch; struct audit_parent *parent = watch->parent; @@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) audit_remove_watch(watch); if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, list); + audit_get_parent(parent); + fsnotify_destroy_mark(&parent->mark); + audit_put_parent(parent); } } } -/* Update watch data in audit rules based on inotify events. */ -static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + return true; +} + +/* Update watch data in audit rules based on fsnotify events. */ +static int audit_watch_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) { + struct inode *inode; + __u32 mask = event->mask; + const char *dname = event->file_name; struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); + parent = container_of(inode_mark, struct audit_parent, mark); - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) + BUG_ON(group != audit_watch_group); + + switch (event->data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = event->path.dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = event->inode; + break; + default: + BUG(); + inode = NULL; + break; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + else if (mask & (FS_DELETE|FS_MOVED_FROM)) audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { + else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); + + return 0; } -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, +static const struct fsnotify_ops audit_watch_fsnotify_ops = { + .should_send_event = audit_watch_should_send_event, + .handle_event = audit_watch_handle_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, }; static int __init audit_watch_init(void) { - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + if (IS_ERR(audit_watch_group)) { + audit_watch_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } return 0; } -subsys_initcall(audit_watch_init); +device_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ce08041f578d..eb7675499fb5 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e) { int i; struct audit_krule *erule = &e->rule; + /* some rules don't have associated watches */ if (erule->watch) audit_put_watch(erule->watch); @@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; - new->watch = NULL; new->field_count = old->field_count; + /* * note that we are OK with not refcounting here; audit_match_tree() * never dereferences tree and we can't get false positives there @@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, } } - if (watch) { - audit_get_watch(watch); - new->watch = watch; + if (old->watch) { + audit_get_watch(old->watch); + new->watch = old->watch; } return entry; @@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int h, err; + int err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry) if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule); + err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); goto error; } - /* entry->rule.watch may have changed during audit_add_watch() */ - watch = entry->rule.watch; - h = audit_hash_ino((u32)audit_watch_inode(watch)); - list = &audit_inode_hash[h]; } if (tree) { err = audit_add_tree_rule(&entry->rule); @@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - LIST_HEAD(inotify_list); int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry) } if (e->rule.watch) - audit_remove_watch_rule(&e->rule, &inotify_list); + audit_remove_watch_rule(&e->rule); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - if (!list_empty(&inotify_list)) - audit_inotify_unregister(&inotify_list); - out: if (watch) audit_put_watch(watch); /* match initial get */ @@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r) { struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; - struct audit_watch *watch; - struct audit_tree *tree; int err = 0; if (!security_audit_rule_known(r)) return 0; - watch = r->watch; - tree = r->tree; - nentry = audit_dupe_rule(r, watch); + nentry = audit_dupe_rule(r); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); - if (watch) + if (r->watch) list_del(&r->rlist); list_del_rcu(&entry->list); list_del(&r->list); } else { - if (watch) { - list_add(&nentry->rule.rlist, audit_watch_rules(watch)); - list_del(&r->rlist); - } else if (tree) + if (r->watch || r->tree) list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); list_replace(&r->list, &nentry->rule.list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3828ad5fb8f1..1b31c130d034 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -65,7 +65,6 @@ #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/inotify.h> #include <linux/capability.h> #include <linux/fs_struct.h> @@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) - result = (name->dev == audit_watch_dev(rule->watch) && - name->ino == audit_watch_inode(rule->watch)); + if (name) + result = audit_watch_compare(rule->watch, name->ino, name->dev); break; case AUDIT_DIR: if (ctx) @@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(list_empty(&inode->inotify_watches))) + if (likely(hlist_empty(&inode->i_fsnotify_marks))) return; context = current->audit_context; p = context->trees; @@ -1769,7 +1767,7 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d->d_inode; - if (inode && unlikely(!list_empty(&inode->inotify_watches))) { + if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { @@ -1837,13 +1835,8 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd.dentry) { - read_lock(¤t->fs->lock); - context->pwd = current->fs->pwd; - path_get(¤t->fs->pwd); - read_unlock(¤t->fs->lock); - } - + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); } /* audit_putname - intercept a putname request diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ac6f5b0a64b..192f88c5b0f9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->release_agent) return -EINVAL; opts->release_agent = - kstrndup(token + 14, PATH_MAX, GFP_KERNEL); + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; } else if (!strncmp(token, "name=", 5)) { @@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->name) return -EINVAL; opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN, + MAX_CGROUP_ROOT_NAMELEN - 1, GFP_KERNEL); if (!opts->name) return -ENOMEM; @@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct kobject *cgroup_kobj; + static inline struct cgroup *__d_cgrp(struct dentry *dentry) { return dentry->d_fsdata; @@ -1788,6 +1790,29 @@ out: return retval; } +/** + * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * @tsk: the task to be attached + */ +int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + struct cgroupfs_root *root; + struct cgroup *cur_cg; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + cur_cg = task_cgroup_from_root(current, root); + retval = cgroup_attach_task(cur_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); + /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task @@ -3871,9 +3896,18 @@ int __init cgroup_init(void) hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); BUG_ON(!init_root_id(&rootnode)); + + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) { + err = -ENOMEM; + goto out; + } + err = register_filesystem(&cgroup_fs_type); - if (err < 0) + if (err < 0) { + kobject_put(cgroup_kobj); goto out; + } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); diff --git a/kernel/compat.c b/kernel/compat.c index 5adab05a3172..e167efce8423 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs (); - - if (resource >= RLIM_NLIMITS) - return -EINVAL; if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || __get_user(r.rlim_cur, &rlim->rlim_cur) || @@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, r.rlim_cur = RLIM_INFINITY; if (r.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; - set_fs(KERNEL_DS); - ret = sys_setrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); - return ret; + return do_prlimit(current, resource, &r, NULL); } #ifdef COMPAT_RLIM_OLD_INFINITY @@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit (unsigned int resource, +asmlinkage long compat_sys_getrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; int ret; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_getrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); + ret = do_prlimit(current, resource, NULL, &r); if (!ret) { if (r.rlim_cur > COMPAT_RLIM_INFINITY) r.rlim_cur = COMPAT_RLIM_INFINITY; diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4ac..f6e726f18491 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadde..b23c0979bbe7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -105,7 +105,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset heirarchy */ + /* used for walking a cpuset hierarchy */ struct list_head stack_list; }; @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/cred.c b/kernel/cred.c index 60bc8b1e32e6..9a3e22641fe7 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -22,10 +22,6 @@ #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) #else -static inline __attribute__((format(printf, 1, 2))) -void no_printk(const char *fmt, ...) -{ -} #define kdebug(FMT, ...) \ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) #endif diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 8bc5eeffec8a..3c2d4972d235 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -605,6 +605,8 @@ cpu_master_loop: if (dbg_kdb_mode) { kgdb_connected = 1; error = kdb_stub(ks); + if (error == -1) + continue; kgdb_connected = 0; } else { error = gdb_serial_stub(ks); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index e8fd6868682d..481a7bd2dfe7 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES + * GDB remote protocol parser: */ -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - #ifdef CONFIG_KGDB_KDB static int gdbstub_read_wait(void) { @@ -123,8 +112,8 @@ static void get_packet(char *buffer) buffer[count] = 0; if (ch == '#') { - xmitcsum = hex(gdbstub_read_wait()) << 4; - xmitcsum += hex(gdbstub_read_wait()); + xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; + xmitcsum += hex_to_bin(gdbstub_read_wait()); if (checksum != xmitcsum) /* failed checksum */ @@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) * buf. Return a pointer to the last char put in buf (null). May * return an error. */ -int kgdb_mem2hex(char *mem, char *buf, int count) +char *kgdb_mem2hex(char *mem, char *buf, int count) { char *tmp; int err; @@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) tmp = buf + count; err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; } + *buf = 0; - return err; + return buf; } /* @@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count) tmp_hex = tmp_raw - 1; while (tmp_hex >= buf) { tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; + *tmp_raw = hex_to_bin(*tmp_hex--); + *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; } return probe_kernel_write(mem, tmp_raw, count); @@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } while (**ptr) { - hex_val = hex(**ptr); + hex_val = hex_to_bin(**ptr); if (hex_val < 0) break; @@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) return probe_kernel_write(mem, c, size); } +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Write memory due to an 'M' or 'X' packet. */ static int write_mem_msg(int binary) { @@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error) * remapped to negative TIDs. */ -#define BUF_THREAD_ID_SIZE 16 +#define BUF_THREAD_ID_SIZE 8 static char *pack_threadid(char *pkt, unsigned char *id) { - char *limit; + unsigned char *limit; + int lzero = 1; + + limit = id + (BUF_THREAD_ID_SIZE / 2); + while (id < limit) { + if (!lzero || *id != 0) { + pkt = pack_hex_byte(pkt, *id); + lzero = 0; + } + id++; + } - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); + if (lzero) + pkt = pack_hex_byte(pkt, 0); return pkt; } static void int_to_threadref(unsigned char *id, int value) { - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); + put_unaligned_be32(value, id); } static struct task_struct *getthread(struct pt_regs *regs, int tid) @@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) pack_hex_byte(&remcom_out_buffer[1], ks->signo); } -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) +static void gdb_get_regs_helper(struct kgdb_state *ks) { struct task_struct *thread; void *local_debuggerinfo; @@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) */ sleeping_thread_to_gdb_regs(gdb_regs, thread); } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); } @@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) char *ptr = &remcom_in_buffer[1]; unsigned long length; unsigned long addr; - int err; + char *err; if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && kgdb_hex2long(&ptr, &length) > 0) { err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); } else { error_packet(remcom_out_buffer, -EINVAL); } @@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) strcpy(remcom_out_buffer, "OK"); } +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ +static void gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + int i = 0; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + memset(gdb_regs, 0, sizeof(gdb_regs)); + while (i < sizeof(gdb_regs) * 2) + if (hex_to_bin(ptr[i]) >= 0) + i++; + else + break; + i = i / 2; + kgdb_hex2mem(ptr, (char *)gdb_regs, i); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Handle the 'X' memory binary write bytes */ static void gdb_cmd_binwrite(struct kgdb_state *ks) { @@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) { struct task_struct *g; struct task_struct *p; - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; int i; int cpu; @@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) for_each_online_cpu(cpu) { ks->thr_query = 0; int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; i++; } @@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) do_each_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; ks->thr_query++; if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) @@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks) int error = 0; int tmp; - /* Clear the out buffer. */ + /* Initialize comm buffer and globals. */ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; if (kgdb_connected) { - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; /* Reply to host that an exception has occurred */ @@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks) put_packet(remcom_out_buffer); } - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - while (1) { error = 0; @@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks) case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_memwrite(ks); break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_binwrite(ks); break; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ebe4a287419e..28b844118bbd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) if (endp == arg) { /* - * Try base 16, for us folks too lazy to type the + * Also try base 16, for us folks too lazy to type the * leading 0x... */ val = simple_strtoul(arg, &endp, 16); @@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) return 0; } +int kdbgetu64arg(const char *arg, u64 *value) +{ + char *endp; + u64 val; + + val = simple_strtoull(arg, &endp, 0); + + if (endp == arg) { + + val = simple_strtoull(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + /* * kdb_set - This function implements the 'set' command. Alter an * existing environment variable or create a new one. @@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int diag = kdb_check_regs(); - if (diag) - return diag; + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; kdb_dumpregs(kdb_current_regs); +#endif return 0; } @@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) * kdb_rm - This function implements the 'rm' (register modify) command. * rm register-name new-contents * Remarks: - * Currently doesn't allow modification of control or - * debug registers. + * Allows register modification with the same restrictions as gdb */ static int kdb_rm(int argc, const char **argv) { +#if DBG_MAX_REG_NUM > 0 int diag; - int ind = 0; - unsigned long contents; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; if (argc != 2) return KDB_ARGCOUNT; /* * Allow presence or absence of leading '%' symbol. */ - if (argv[1][0] == '%') - ind = 1; + rname = argv[1]; + if (*rname == '%') + rname++; - diag = kdbgetularg(argv[2], &contents); + diag = kdbgetu64arg(argv[2], ®64); if (diag) return diag; diag = kdb_check_regs(); if (diag) return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else kdb_printf("ERROR: Register set currently not implemented\n"); - return 0; + return 0; +#endif } #if defined(CONFIG_MAGIC_SYSRQ) @@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { + struct timespec now; struct kdb_tm tm; struct sysinfo val; @@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - kdb_gmtime(&xtime, &tm); + now = __current_kernel_time(); + kdb_gmtime(&now, &tm); kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba69775d..c438f545a321 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); -extern int kdb_set(int, const char **); extern char *kdbgetenv(const char *); -extern int kdbgetintenv(const char *, int *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index dd62f8e714ca..0dbeae374225 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -134,23 +134,14 @@ unregister: return 0; } -int -__set_personality(unsigned int personality) +int __set_personality(unsigned int personality) { - struct exec_domain *ep, *oep; - - ep = lookup_exec_domain(personality); - if (ep == current_thread_info()->exec_domain) { - current->personality = personality; - module_put(ep->module); - return 0; - } + struct exec_domain *oep = current_thread_info()->exec_domain; + current_thread_info()->exec_domain = lookup_exec_domain(personality); current->personality = personality; - oep = current_thread_info()->exec_domain; - current_thread_info()->exec_domain = ep; - module_put(oep->module); + return 0; } @@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) { unsigned int old = current->personality; - if (personality != 0xffffffff) { + if (personality != 0xffffffff) set_personality(personality); - if (current->personality != personality) - return -EINVAL; - } return old; } diff --git a/kernel/exit.c b/kernel/exit.c index ceffc67b564a..671ed56e0a49 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(dead_children); - exit_ptrace(father); - write_lock_irq(&tasklist_lock); + /* + * Note that exit_ptrace() and find_new_reaper() might + * drop tasklist_lock and reacquire it. + */ + exit_ptrace(father); reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba047..98b450876f93 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; return 0; } @@ -907,7 +908,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..ce669174f355 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); xts = __current_kernel_time(); - tom = wall_to_monotonic; + tom = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } @@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset; + struct timespec realtime_offset, wtm; unsigned long seq; if (!hrtimer_hres_active()) @@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) do { seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); + wtm = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index e34d94d50924..d71a987fd2bf 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -242,6 +242,17 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, } /* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ +} + +/* * Contraints to check before allowing this new breakpoint counter: * * == Non-pinned counter == (Considered as pinned for now) @@ -343,6 +354,7 @@ void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); + arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8a..c3003e9d91a3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->status |= IRQ_SUSPENDED; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 131b1703936f..c0613f7d6730 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); result = copy_from_user(image->segment, segments, segment_bytes); - if (result) + if (result) { + result = -EFAULT; goto out; + } /* * Verify we have good destination addresses. The caller is @@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { - result = (result < 0) ? result : -EIO; + result = -EFAULT; goto out; } ubytes -= uchunk; @@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image, kexec_flush_icache_page(page); kunmap(page); if (result) { - result = (result < 0) ? result : -EIO; + result = -EFAULT; goto out; } ubytes -= uchunk; diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 35edbe22e9a9..4502604ecadf 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,8 +1,7 @@ /* - * A generic kernel FIFO implementation. + * A generic kernel FIFO implementation * - * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> - * Copyright (C) 2004 Stelian Pop <stelian@popies.net> + * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -11,7 +10,7 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License @@ -24,422 +23,579 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/kfifo.h> #include <linux/log2.h> #include <linux/uaccess.h> +#include <linux/kfifo.h> -static void _kfifo_init(struct kfifo *fifo, void *buffer, - unsigned int size) -{ - fifo->buffer = buffer; - fifo->size = size; - - kfifo_reset(fifo); -} - -/** - * kfifo_init - initialize a FIFO using a preallocated buffer - * @fifo: the fifo to assign the buffer - * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this has to be a power of 2. - * +/* + * internal helper to calculate the unused elements in a fifo */ -void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) +static inline unsigned int kfifo_unused(struct __kfifo *fifo) { - /* size must be a power of 2 */ - BUG_ON(!is_power_of_2(size)); - - _kfifo_init(fifo, buffer, size); + return (fifo->mask + 1) - (fifo->in - fifo->out); } -EXPORT_SYMBOL(kfifo_init); -/** - * kfifo_alloc - allocates a new FIFO internal buffer - * @fifo: the fifo to assign then new buffer - * @size: the size of the buffer to be allocated, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() - * - * This function dynamically allocates a new fifo internal buffer - * - * The size will be rounded-up to a power of 2. - * The buffer will be release with kfifo_free(). - * Return 0 if no error, otherwise the an error code - */ -int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) +int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) { - unsigned char *buffer; - /* - * round up to the next power of 2, since our 'let the indices + * round down to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ - if (!is_power_of_2(size)) { - BUG_ON(size > 0x80000000); - size = roundup_pow_of_two(size); + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + + if (size < 2) { + fifo->data = NULL; + fifo->mask = 0; + return -EINVAL; } - buffer = kmalloc(size, gfp_mask); - if (!buffer) { - _kfifo_init(fifo, NULL, 0); + fifo->data = kmalloc(size * esize, gfp_mask); + + if (!fifo->data) { + fifo->mask = 0; return -ENOMEM; } - - _kfifo_init(fifo, buffer, size); + fifo->mask = size - 1; return 0; } -EXPORT_SYMBOL(kfifo_alloc); +EXPORT_SYMBOL(__kfifo_alloc); -/** - * kfifo_free - frees the FIFO internal buffer - * @fifo: the fifo to be freed. - */ -void kfifo_free(struct kfifo *fifo) +void __kfifo_free(struct __kfifo *fifo) { - kfree(fifo->buffer); - _kfifo_init(fifo, NULL, 0); + kfree(fifo->data); + fifo->in = 0; + fifo->out = 0; + fifo->esize = 0; + fifo->data = NULL; + fifo->mask = 0; } -EXPORT_SYMBOL(kfifo_free); +EXPORT_SYMBOL(__kfifo_free); -/** - * kfifo_skip - skip output data - * @fifo: the fifo to be used. - * @len: number of bytes to skip - */ -void kfifo_skip(struct kfifo *fifo, unsigned int len) +int __kfifo_init(struct __kfifo *fifo, void *buffer, + unsigned int size, size_t esize) { - if (len < kfifo_len(fifo)) { - __kfifo_add_out(fifo, len); - return; + size /= esize; + + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + fifo->data = buffer; + + if (size < 2) { + fifo->mask = 0; + return -EINVAL; } - kfifo_reset_out(fifo); + fifo->mask = size - 1; + + return 0; } -EXPORT_SYMBOL(kfifo_skip); +EXPORT_SYMBOL(__kfifo_init); -static inline void __kfifo_in_data(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int off) +static void kfifo_copy_in(struct __kfifo *fifo, const void *src, + unsigned int len, unsigned int off) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(fifo->data + off, src, l); + memcpy(fifo->data, src + l, len - l); /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter */ + smp_wmb(); +} - smp_mb(); - - off = __kfifo_off(fifo, fifo->in + off); +unsigned int __kfifo_in(struct __kfifo *fifo, + const void *buf, unsigned int len) +{ + unsigned int l; - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - off); - memcpy(fifo->buffer + off, from, l); + l = kfifo_unused(fifo); + if (len > l) + len = l; - /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, from + l, len - l); + kfifo_copy_in(fifo, buf, len, fifo->in); + fifo->in += len; + return len; } +EXPORT_SYMBOL(__kfifo_in); -static inline void __kfifo_out_data(struct kfifo *fifo, - void *to, unsigned int len, unsigned int off) +static void kfifo_copy_out(struct __kfifo *fifo, void *dst, + unsigned int len, unsigned int off) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(dst, fifo->data + off, l); + memcpy(dst + l, fifo->data, len - l); /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. + * make sure that the data is copied before + * incrementing the fifo->out index counter */ + smp_wmb(); +} - smp_rmb(); +unsigned int __kfifo_out_peek(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + unsigned int l; - off = __kfifo_off(fifo, fifo->out + off); + l = fifo->in - fifo->out; + if (len > l) + len = l; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - memcpy(to, fifo->buffer + off, l); + kfifo_copy_out(fifo, buf, len, fifo->out); + return len; +} +EXPORT_SYMBOL(__kfifo_out_peek); - /* then get the rest (if any) from the beginning of the buffer */ - memcpy(to + l, fifo->buffer, len - l); +unsigned int __kfifo_out(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + len = __kfifo_out_peek(fifo, buf, len); + fifo->out += len; + return len; } +EXPORT_SYMBOL(__kfifo_out); -static inline int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned *lenout) +static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned int *copied) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; - int ret; + unsigned long ret; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_from_user(fifo->data + off, from, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_from_user(fifo->data, from + l, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} - smp_mb(); +int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; - off = __kfifo_off(fifo, fifo->in + off); + if (esize != 1) + len /= esize; - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - off); - ret = copy_from_user(fifo->buffer + off, from, l); - if (unlikely(ret)) { - *lenout = ret; - return -EFAULT; - } - *lenout = l; + l = kfifo_unused(fifo); + if (len > l) + len = l; - /* then put the rest (if any) at the beginning of the buffer */ - ret = copy_from_user(fifo->buffer, from + l, len - l); - *lenout += ret ? ret : len - l; - return ret ? -EFAULT : 0; + ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->in += len; + return err; } +EXPORT_SYMBOL(__kfifo_from_user); -static inline int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off, unsigned *lenout) +static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, + unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; - int ret; - + unsigned long ret; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_to_user(to, fifo->data + off, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_to_user(to + l, fifo->data, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. + * make sure that the data is copied before + * incrementing the fifo->out index counter */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} - smp_rmb(); +int __kfifo_to_user(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; - off = __kfifo_off(fifo, fifo->out + off); + if (esize != 1) + len /= esize; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - ret = copy_to_user(to, fifo->buffer + off, l); - *lenout = l; + l = fifo->in - fifo->out; + if (len > l) + len = l; + ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { - *lenout -= ret; - return -EFAULT; - } + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->out += len; + return err; +} +EXPORT_SYMBOL(__kfifo_to_user); - /* then get the rest (if any) from the beginning of the buffer */ - len -= l; - ret = copy_to_user(to + l, fifo->buffer, len); - if (unlikely(ret)) { - *lenout += len - ret; - return -EFAULT; +static int setup_sgl_buf(struct scatterlist *sgl, void *buf, + int nents, unsigned int len) +{ + int n; + unsigned int l; + unsigned int off; + struct page *page; + + if (!nents) + return 0; + + if (!len) + return 0; + + n = 0; + page = virt_to_page(buf); + off = offset_in_page(buf); + l = 0; + + while (len >= l + PAGE_SIZE - off) { + struct page *npage; + + l += PAGE_SIZE; + buf += PAGE_SIZE; + npage = virt_to_page(buf); + if (page_to_phys(page) != page_to_phys(npage) - l) { + sg_set_page(sgl, page, l - off, off); + sgl = sg_next(sgl); + if (++n == nents || sgl == NULL) + return n; + page = npage; + len -= l - off; + l = off = 0; + } } - *lenout += len; - return 0; + sg_set_page(sgl, page, len, off); + return n + 1; } -unsigned int __kfifo_in_n(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int recsize) +static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, + int nents, unsigned int len, unsigned int off) { - if (kfifo_avail(fifo) < len + recsize) - return len + 1; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned int n; - __kfifo_in_data(fifo, from, len, recsize); - return 0; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + n = setup_sgl_buf(sgl, fifo->data + off, nents, l); + n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); + + if (n) + sg_mark_end(sgl + n - 1); + return n; } -EXPORT_SYMBOL(__kfifo_in_n); -/** - * kfifo_in - puts some data into the FIFO - * @fifo: the fifo to be used. - * @from: the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_in(struct kfifo *fifo, const void *from, - unsigned int len) +unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) { - len = min(kfifo_avail(fifo), len); + unsigned int l; - __kfifo_in_data(fifo, from, len, 0); - __kfifo_add_in(fifo, len); - return len; + l = kfifo_unused(fifo); + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->in); } -EXPORT_SYMBOL(kfifo_in); +EXPORT_SYMBOL(__kfifo_dma_in_prepare); -unsigned int __kfifo_in_generic(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) { - return __kfifo_in_rec(fifo, from, len, recsize); + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->out); } -EXPORT_SYMBOL(__kfifo_in_generic); +EXPORT_SYMBOL(__kfifo_dma_out_prepare); -unsigned int __kfifo_out_n(struct kfifo *fifo, - void *to, unsigned int len, unsigned int recsize) +unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { - if (kfifo_len(fifo) < len + recsize) - return len; + unsigned int max = (1 << (recsize << 3)) - 1; - __kfifo_out_data(fifo, to, len, recsize); - __kfifo_add_out(fifo, len + recsize); - return 0; + if (len > max) + return max; + return len; } -EXPORT_SYMBOL(__kfifo_out_n); -/** - * kfifo_out - gets some data from the FIFO - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. +#define __KFIFO_PEEK(data, out, mask) \ + ((data)[(out) & (mask)]) +/* + * __kfifo_peek_n internal helper function for determinate the length of + * the next record in the fifo */ -unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) +static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { - len = min(kfifo_len(fifo), len); + unsigned int l; + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; - __kfifo_out_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); + l = __KFIFO_PEEK(data, fifo->out, mask); - return len; + if (--recsize) + l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; + + return l; } -EXPORT_SYMBOL(kfifo_out); - -/** - * kfifo_out_peek - copy some data from the FIFO, but do not remove it - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * @offset: offset into the fifo - * - * This function copies at most @len bytes at @offset from the FIFO - * into the @to buffer and returns the number of copied bytes. - * The data is not removed from the FIFO. + +#define __KFIFO_POKE(data, in, mask, val) \ + ( \ + (data)[(in) & (mask)] = (unsigned char)(val) \ + ) + +/* + * __kfifo_poke_n internal helper function for storeing the length of + * the record into the fifo */ -unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, - unsigned offset) +static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { - len = min(kfifo_len(fifo), len + offset); + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; - __kfifo_out_data(fifo, to, len, offset); - return len; + __KFIFO_POKE(data, fifo->in, mask, n); + + if (recsize > 1) + __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } -EXPORT_SYMBOL(kfifo_out_peek); -unsigned int __kfifo_out_generic(struct kfifo *fifo, - void *to, unsigned int len, unsigned int recsize, - unsigned int *total) +unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { - return __kfifo_out_rec(fifo, to, len, recsize, total); + return __kfifo_peek_n(fifo, recsize); } -EXPORT_SYMBOL(__kfifo_out_generic); +EXPORT_SYMBOL(__kfifo_len_r); -unsigned int __kfifo_from_user_n(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, + unsigned int len, size_t recsize) { - unsigned total; + if (len + recsize > kfifo_unused(fifo)) + return 0; - if (kfifo_avail(fifo) < len + recsize) - return len + 1; + __kfifo_poke_n(fifo, len, recsize); - __kfifo_from_user_data(fifo, from, len, recsize, &total); - return total; + kfifo_copy_in(fifo, buf, len, fifo->in + recsize); + fifo->in += len + recsize; + return len; } -EXPORT_SYMBOL(__kfifo_from_user_n); - -/** - * kfifo_from_user - puts some data from user space into the FIFO - * @fifo: the fifo to be used. - * @from: pointer to the data to be added. - * @len: the length of the data to be added. - * @total: the actual returned data length. - * - * This function copies at most @len bytes from the @from into the - * FIFO depending and returns -EFAULT/0. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned *total) +EXPORT_SYMBOL(__kfifo_in_r); + +static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, + void *buf, unsigned int len, size_t recsize, unsigned int *n) { - int ret; - len = min(kfifo_avail(fifo), len); - ret = __kfifo_from_user_data(fifo, from, len, 0, total); - if (ret) - return ret; - __kfifo_add_in(fifo, len); - return 0; + *n = __kfifo_peek_n(fifo, recsize); + + if (len > *n) + len = *n; + + kfifo_copy_out(fifo, buf, len, fifo->out + recsize); + return len; +} + +unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) +{ + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } -EXPORT_SYMBOL(kfifo_from_user); +EXPORT_SYMBOL(__kfifo_out_peek_r); -unsigned int __kfifo_from_user_generic(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) { - return __kfifo_from_user_rec(fifo, from, len, recsize); + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); + fifo->out += n + recsize; + return len; } -EXPORT_SYMBOL(__kfifo_from_user_generic); +EXPORT_SYMBOL(__kfifo_out_r); -unsigned int __kfifo_to_user_n(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int reclen, - unsigned int recsize) +int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied, size_t recsize) { - unsigned int ret, total; + unsigned long ret; - if (kfifo_len(fifo) < reclen + recsize) - return len; + len = __kfifo_max_r(len, recsize); - ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); + if (len + recsize > kfifo_unused(fifo)) { + *copied = 0; + return 0; + } - if (likely(ret == 0)) - __kfifo_add_out(fifo, reclen + recsize); + __kfifo_poke_n(fifo, len, recsize); - return total; + ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->in += len + recsize; + return 0; } -EXPORT_SYMBOL(__kfifo_to_user_n); - -/** - * kfifo_to_user - gets data from the FIFO and write it to user space - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * @lenout: pointer to output variable with copied data - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and 0 or -EFAULT. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned *lenout) +EXPORT_SYMBOL(__kfifo_from_user_r); + +int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied, size_t recsize) { - int ret; - len = min(kfifo_len(fifo), len); - ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); - __kfifo_add_out(fifo, *lenout); - return ret; + unsigned long ret; + unsigned int n; + + if (fifo->in == fifo->out) { + *copied = 0; + return 0; + } + + n = __kfifo_peek_n(fifo, recsize); + if (len > n) + len = n; + + ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->out += n + recsize; + return 0; } -EXPORT_SYMBOL(kfifo_to_user); +EXPORT_SYMBOL(__kfifo_to_user_r); -unsigned int __kfifo_to_user_generic(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int recsize, - unsigned int *total) +unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - return __kfifo_to_user_rec(fifo, to, len, recsize, total); + if (!nents) + BUG(); + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); } -EXPORT_SYMBOL(__kfifo_to_user_generic); +EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); -unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +void __kfifo_dma_in_finish_r(struct __kfifo *fifo, + unsigned int len, size_t recsize) { - if (recsize == 0) - return kfifo_avail(fifo); - - return __kfifo_peek_n(fifo, recsize); + len = __kfifo_max_r(len, recsize); + __kfifo_poke_n(fifo, len, recsize); + fifo->in += len + recsize; } -EXPORT_SYMBOL(__kfifo_peek_generic); +EXPORT_SYMBOL(__kfifo_dma_in_finish_r); -void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - __kfifo_skip_rec(fifo, recsize); + if (!nents) + BUG(); + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > fifo->in - fifo->out) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); } -EXPORT_SYMBOL(__kfifo_skip_generic); +EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); + +void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int len; + len = __kfifo_peek_n(fifo, recsize); + fifo->out += len + recsize; +} +EXPORT_SYMBOL(__kfifo_dma_out_finish_r); diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c780175..2dc3786349d1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,8 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/freezer.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -35,6 +37,7 @@ struct kthread_create_info struct kthread { int should_stop; + void *data; struct completion exited; }; @@ -54,6 +57,19 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. + */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -64,6 +80,7 @@ static int kthread(void *_create) int ret; self.should_stop = 0; + self.data = data; init_completion(&self.exited); current->vfork_done = &self.exited; @@ -247,3 +264,150 @@ int kthreadd(void *unused) return 0; } + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * the it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works. + */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @task for async execution. @task + * must have been created with kthread_worker_create(). Returns %true + * if @work was successfully queued, %false if it was already pending. + */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished. + */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37b..f2852a510232 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) diff --git a/kernel/module.c b/kernel/module.c index 6c562828c85c..d0b5f8db11b4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,6 +1,6 @@ /* Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb) } EXPORT_SYMBOL(unregister_module_notifier); +struct load_info { + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long *strmap; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + /* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) @@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code) EXPORT_SYMBOL(__module_put_and_exit); /* Find a module section: 0 means not found. */ -static unsigned int find_sec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, - const char *name) +static unsigned int find_sec(const struct load_info *info, const char *name) { unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; /* Alloc bit cleared means "ignore it." */ - if ((sechdrs[i].sh_flags & SHF_ALLOC) - && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) return i; + } return 0; } /* Find a module section, or NULL. */ -static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, - const char *secstrings, const char *name) +static void *section_addr(const struct load_info *info, const char *name) { /* Section 0 has sh_addr 0. */ - return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; } /* Find a module section, or NULL. Fill in number of "objects" in section. */ -static void *section_objs(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, +static void *section_objs(const struct load_info *info, const char *name, size_t object_size, unsigned int *num) { - unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + unsigned int sec = find_sec(info, name); /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = sechdrs[sec].sh_size / object_size; - return (void *)sechdrs[sec].sh_addr; + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; } /* Provided by the linker */ @@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, unsigned int symnum, void *data), void *data) { struct module *mod; - const struct symsearch arr[] = { + static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, @@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod, mod->percpu = __alloc_reserved_percpu(size, align); if (!mod->percpu) { printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, size); return -ENOMEM; } mod->percpu_size = size; @@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod) free_percpu(mod->percpu); } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { - return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); + return find_sec(info, ".data..percpu"); } static void percpu_modcopy(struct module *mod, @@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod, static inline void percpu_modfree(struct module *mod) { } -static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { return 0; } @@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); /* Init the unload section of the module. */ -static void module_unload_init(struct module *mod) +static int module_unload_init(struct module *mod) { - int cpu; + mod->refptr = alloc_percpu(struct module_ref); + if (!mod->refptr) + return -ENOMEM; INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); - for_each_possible_cpu(cpu) { - per_cpu_ptr(mod->refptr, cpu)->incs = 0; - per_cpu_ptr(mod->refptr, cpu)->decs = 0; - } /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; + + return 0; } /* Does a already use b? */ @@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); + + free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -891,8 +900,9 @@ int ref_module(struct module *a, struct module *b) } EXPORT_SYMBOL_GPL(ref_module); -static inline void module_unload_init(struct module *mod) +static inline int module_unload_init(struct module *mod) { + return 0; } #endif /* CONFIG_MODULE_UNLOAD */ @@ -1051,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic, #endif /* CONFIG_MODVERSIONS */ /* Resolve a symbol for this module. I.e. if we find one, record usage. */ -static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, +static const struct kernel_symbol *resolve_symbol(struct module *mod, + const struct load_info *info, const char *name, - struct module *mod, char ownername[]) { struct module *owner; @@ -1068,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, if (!sym) goto unlock; - if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, + owner)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -1087,21 +1097,20 @@ unlock: return sym; } -static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *name, - struct module *mod) +static const struct kernel_symbol * +resolve_symbol_wait(struct module *mod, + const struct load_info *info, + const char *name) { const struct kernel_symbol *ksym; - char ownername[MODULE_NAME_LEN]; + char owner[MODULE_NAME_LEN]; if (wait_event_interruptible_timeout(module_wq, - !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, - mod, ownername)) || - PTR_ERR(ksym) != -EBUSY, + !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) + || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", - mod->name, ownername); + mod->name, owner); } return ksym; } @@ -1110,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> */ -#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) +#ifdef CONFIG_SYSFS +#ifdef CONFIG_KALLSYMS static inline bool sect_empty(const Elf_Shdr *sect) { return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; @@ -1148,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; @@ -1157,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, struct attribute **gattr; /* Count loaded sections and allocate structures */ - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1175,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < nsect; i++) { - if (sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + if (sect_empty(sec)) continue; - sattr->address = sechdrs[i].sh_addr; - sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + sattr->address = sec->sh_addr; + sattr->name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); if (sattr->name == NULL) goto out; @@ -1247,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; @@ -1260,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i]) && - (sechdrs[i].sh_type == SHT_NOTE)) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + (info->sechdrs[i].sh_type == SHT_NOTE)) ++notes; if (notes == 0) @@ -1276,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < nsect; ++i) { - if (sect_empty(&sechdrs[i])) + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) continue; - if (sechdrs[i].sh_type == SHT_NOTE) { + if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; - nattr->size = sechdrs[i].sh_size; - nattr->private = (void *) sechdrs[i].sh_addr; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *) info->sechdrs[i].sh_addr; nattr->read = module_notes_read; ++nattr; } @@ -1315,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod) #else -static inline void add_sect_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_sect_attrs(struct module *mod, + const struct load_info *info) { } @@ -1324,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_notes_attrs(struct module *mod, + const struct load_info *info) { } static inline void remove_notes_attrs(struct module *mod) { } -#endif +#endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_SYSFS static void add_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD @@ -1439,6 +1447,7 @@ out: } static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { @@ -1463,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod, goto out_unreg_param; add_usage_links(mod); + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1479,33 +1490,26 @@ out: static void mod_sysfs_fini(struct module *mod) { + remove_notes_attrs(mod); + remove_sect_attrs(mod); kobject_put(&mod->mkobj.kobj); } -#else /* CONFIG_SYSFS */ - -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} +#else /* !CONFIG_SYSFS */ -static inline int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { return 0; } -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) +static void mod_sysfs_fini(struct module *mod) { } -static void mod_sysfs_fini(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { } @@ -1515,7 +1519,7 @@ static void del_usage_links(struct module *mod) #endif /* CONFIG_SYSFS */ -static void mod_kobject_remove(struct module *mod) +static void mod_sysfs_teardown(struct module *mod) { del_usage_links(mod); module_remove_modinfo_attrs(mod); @@ -1545,9 +1549,7 @@ static void free_module(struct module *mod) mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); mutex_unlock(&module_mutex); - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_remove(mod); + mod_sysfs_teardown(mod); /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1565,10 +1567,7 @@ static void free_module(struct module *mod) module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); -#if defined(CONFIG_MODULE_UNLOAD) - if (mod->refptr) - free_percpu(mod->refptr); -#endif + /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -1634,25 +1633,23 @@ static int verify_export_symbols(struct module *mod) } /* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - unsigned int versindex, - unsigned int pcpuindex, - struct module *mod) -{ - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; unsigned long secbase; - unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + unsigned int i; int ret = 0; const struct kernel_symbol *ksym; - for (i = 1; i < n; i++) { + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + switch (sym[i].st_shndx) { case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); + DEBUGP("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1665,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol_wait(sechdrs, versindex, - strtab + sym[i].st_name, - mod); + ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; @@ -1679,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, strtab + sym[i].st_name, - PTR_ERR(ksym)); + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; default: /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == pcpuindex) + if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); else - secbase = sechdrs[sym[i].st_shndx].sh_addr; + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; break; } @@ -1698,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + /* Additional bytes needed by arch in front of individual sections */ unsigned int __weak arch_mod_section_prepend(struct module *mod, unsigned int section) @@ -1722,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size, might -- code, read-only data, read-write data, small data. Tally sizes, and place the offsets into sh_entsize fields: high bit means it belongs in init. */ -static void layout_sections(struct module *mod, - const Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static void layout_sections(struct module *mod, struct load_info *info) { static unsigned long const masks[][2] = { /* NOTE: all executable code must be the first section @@ -1738,21 +1758,22 @@ static void layout_sections(struct module *mod, }; unsigned int m, i; - for (i = 0; i < hdr->e_shnum; i++) - sechdrs[i].sh_entsize = ~0UL; + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; DEBUGP("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(secstrings + s->sh_name, ".init")) + || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", name); } if (m == 0) mod->core_text_size = mod->core_size; @@ -1760,17 +1781,18 @@ static void layout_sections(struct module *mod, DEBUGP("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(secstrings + s->sh_name, ".init")) + || !strstarts(sname, ".init")) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", sname); } if (m == 0) mod->init_text_size = mod->init_size; @@ -1809,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(Elf_Shdr *sechdrs, - unsigned int info, - const char *tag) +static char *get_modinfo(struct load_info *info, const char *tag) { char *p; unsigned int taglen = strlen(tag); - unsigned long size = sechdrs[info].sh_size; + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; - for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { + for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } -static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, - unsigned int infoindex) +static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) - attr->setup(mod, - get_modinfo(sechdrs, - infoindex, - attr->attr.name)); + attr->setup(mod, get_modinfo(info, attr->attr.name)); } } @@ -1876,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value, } /* As per nm */ -static char elf_type(const Elf_Sym *sym, - Elf_Shdr *sechdrs, - const char *secstrings, - struct module *mod) +static char elf_type(const Elf_Sym *sym, const struct load_info *info) { + const Elf_Shdr *sechdrs = info->sechdrs; + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) return 'v'; @@ -1910,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym, else return 'b'; } - if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { return 'n'; + } return '?'; } @@ -1936,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } -static unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static void layout_symtab(struct module *mod, struct load_info *info) { - unsigned long symoffs; - Elf_Shdr *symsect = sechdrs + symindex; - Elf_Shdr *strsect = sechdrs + strindex; + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - const char *strtab; unsigned int i, nsrc, ndst; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, - symindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + symsect->sh_name); + info->index.sym) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); - src = (void *)hdr + symsect->sh_offset; + src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - strtab = (void *)hdr + strsect->sh_offset; for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; - while(!__test_and_set_bit(j, strmap) && strtab[j]) + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) ++j; ++ndst; } /* Append room for core symbols at end of core part. */ - symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, - strindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + strsect->sh_name); + info->index.str) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); /* Append room for core symbols' strings at end of core part. */ - *pstroffs = mod->core_size; - __set_bit(0, strmap); - mod->core_size += bitmap_weight(strmap, strsect->sh_size); - - return symoffs; + info->stroffs = mod->core_size; + __set_bit(0, info->strmap); + mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -static void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - unsigned long *strmap) +static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; const Elf_Sym *src; Elf_Sym *dst; char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; + mod->symtab = (void *)symsec->sh_addr; + mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info - = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + symoffs; + mod->core_symtab = dst = mod->module_core + info->symoffs; src = mod->symtab; *dst = *src; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, sechdrs, shnum)) + if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + dst[ndst].st_name = bitmap_weight(info->strmap, + dst[ndst].st_name); ++ndst; } mod->core_num_syms = ndst; - mod->core_strtab = s = mod->module_core + stroffs; - for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) - if (test_bit(i, strmap)) + mod->core_strtab = s = mod->module_core + info->stroffs; + for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) + if (test_bit(i, info->strmap)) *++s = mod->strtab[i]; } #else -static inline unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static inline void layout_symtab(struct module *mod, struct load_info *info) { - return 0; } -static inline void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - const unsigned long *strmap) +static void add_kallsyms(struct module *mod, struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { + if (!debug) + return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) printk(KERN_ERR "dynamic debug error adding module: %s\n", @@ -2087,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size) } #ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { unsigned int i; /* only scan the sections containing data */ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - for (i = 1; i < hdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + for (i = 1; i < info->hdr->e_shnum; i++) { + const char *name = info->secstrings + info->sechdrs[i].sh_name; + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) continue; - if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 - && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + if (!strstarts(name, ".data") && !strstarts(name, ".bss")) continue; - kmemleak_scan_area((void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size, GFP_KERNEL); + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); } } #else -static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } #endif -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. */ -static noinline struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +/* Sets info->hdr and info->len. */ +static int copy_and_check(struct load_info *info, + const void __user *umod, unsigned long len, + const char __user *uargs) { + int err; Elf_Ehdr *hdr; - Elf_Shdr *sechdrs; - char *secstrings, *args, *modmagic, *strtab = NULL; - char *staging; - unsigned int i; - unsigned int symindex = 0; - unsigned int strindex = 0; - unsigned int modindex, versindex, infoindex, pcpuindex; - struct module *mod; - long err = 0; - void *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long symoffs, stroffs, *strmap; - void __percpu *percpu; - struct _ddebug *debug = NULL; - unsigned int num_debug = 0; - mm_segment_t old_fs; - - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); if (len < sizeof(*hdr)) - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; @@ -2153,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod, } /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ + weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(*sechdrs)) { + || hdr->e_shentsize != sizeof(Elf_Shdr)) { err = -ENOEXEC; goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) - goto truncated; + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } - /* Convenience variables */ - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - sechdrs[0].sh_addr = 0; + info->hdr = hdr; + info->len = len; + return 0; - for (i = 1; i < hdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_NOBITS - && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) - goto truncated; +free_hdr: + vfree(hdr); + return err; +} + +static void free_copy(struct load_info *info) +{ + vfree(info->hdr); +} + +static int rewrite_section_headers(struct load_info *info) +{ + unsigned int i; + + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (shdr->sh_type != SHT_NOBITS + && info->len < shdr->sh_offset + shdr->sh_size) { + printk(KERN_ERR "Module len %lu truncated\n", + info->len); + return -ENOEXEC; + } /* Mark all sections sh_addr with their address in the temporary image. */ - sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - /* Internal symbols and strings. */ - if (sechdrs[i].sh_type == SHT_SYMTAB) { - symindex = i; - strindex = sechdrs[i].sh_link; - strtab = (char *)hdr + sechdrs[strindex].sh_offset; - } #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; + if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } - modindex = find_sec(hdr, sechdrs, secstrings, - ".gnu.linkonce.this_module"); - if (!modindex) { + /* Track but don't keep modinfo and version sections. */ + info->index.vers = find_sec(info, "__versions"); + info->index.info = find_sec(info, ".modinfo"); + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. + * + * Return the temporary module pointer (we'll replace it with the final + * one when we move the module sections around). + */ +static struct module *setup_load_info(struct load_info *info) +{ + unsigned int i; + int err; + struct module *mod; + + /* Set up the convenience variables */ + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + info->secstrings = (void *)info->hdr + + info->sechdrs[info->hdr->e_shstrndx].sh_offset; + + err = rewrite_section_headers(info); + if (err) + return ERR_PTR(err); + + /* Find internal symbols and strings. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { printk(KERN_WARNING "No module found in object\n"); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ - mod = (void *)sechdrs[modindex].sh_addr; + mod = (void *)info->sechdrs[info->index.mod].sh_addr; - if (symindex == 0) { + if (info->index.sym == 0) { printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", mod->name); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } - versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); - infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); - pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); - - /* Don't keep modinfo and version sections. */ - sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(sechdrs, versindex, mod)) { - err = -ENOEXEC; - goto free_hdr; - } + if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + return ERR_PTR(-ENOEXEC); + + return mod; +} + +static int check_modinfo(struct module *mod, struct load_info *info) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; - modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); if (err) - goto free_hdr; - } else if (!same_magic(modmagic, vermagic, versindex)) { + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); - err = -ENOEXEC; - goto free_hdr; + return -ENOEXEC; } - staging = get_modinfo(sechdrs, infoindex, "staging"); - if (staging) { + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); } - /* Now copy in args */ - args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(args)) { - err = PTR_ERR(args); - goto free_hdr; - } + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); - strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) - * sizeof(long), GFP_KERNEL); - if (!strmap) { - err = -ENOMEM; - goto free_mod; - } + return 0; +} - mod->state = MODULE_STATE_COMING; +static void find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(info, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); - if (err < 0) - goto free_mod; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->unused_syms = section_objs(info, "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(info, "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); +#endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif - if (pcpuindex) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); - if (err) - goto free_mod; - sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - } - /* Keep this around for failure path. */ - percpu = mod_percpu(mod); +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(info, "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, hdr, sechdrs, secstrings); - symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, - secstrings, &stroffs, strmap); + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + + if (section_addr(info, "__obsparm")) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + info->debug = section_objs(info, "__verbose", + sizeof(*info->debug), &info->num_debug); +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2291,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod, * leak. */ kmemleak_not_leak(ptr); - if (!ptr) { - err = -ENOMEM; - goto free_percpu; - } + if (!ptr) + return -ENOMEM; + memset(ptr, 0, mod->core_size); mod->module_core = ptr; @@ -2307,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod, */ kmemleak_ignore(ptr); if (!ptr && mod->init_size) { - err = -ENOMEM; - goto free_core; + module_free(mod, mod->module_core); + return -ENOMEM; } memset(ptr, 0, mod->init_size); mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ DEBUGP("final section addresses:\n"); - for (i = 0; i < hdr->e_shnum; i++) { + for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (!(shdr->sh_flags & SHF_ALLOC)) continue; - if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) + if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->module_init - + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + sechdrs[i].sh_entsize; + dest = mod->module_core + shdr->sh_entsize; - if (sechdrs[i].sh_type != SHT_NOBITS) - memcpy(dest, (void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size); + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ - sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); - } - /* Module has been moved. */ - mod = (void *)sechdrs[modindex].sh_addr; - kmemleak_load_module(mod, hdr, sechdrs, secstrings); - -#if defined(CONFIG_MODULE_UNLOAD) - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) { - err = -ENOMEM; - goto free_init; + shdr->sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", + shdr->sh_addr, info->secstrings + shdr->sh_name); } -#endif - /* Now we've moved module, initialize linked lists, etc. */ - module_unload_init(mod); - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + return 0; +} +static int check_module_license_and_versions(struct module *mod) +{ /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -2363,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod, if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, sechdrs, infoindex); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, - mod); - if (err < 0) - goto cleanup; - - /* Now we've got everything in the final locations, we can - * find optional sections. */ - mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); - mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused_gpl"); -#endif -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); -#endif - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(hdr, sechdrs, secstrings, - "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(hdr, sechdrs, secstrings, - "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, - "__mcount_loc", - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2443,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod, || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { - err = try_to_force_load(mod, - "no versions for exported symbols"); - if (err) - goto cleanup; + return try_to_force_load(mod, + "no versions for exported symbols"); } #endif + return 0; +} - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { - const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (info >= hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(sechdrs[info].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(sechdrs, strtab, symindex, i,mod); - else if (sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(sechdrs, strtab, symindex, i, - mod); - if (err < 0) - goto cleanup; - } - - /* Set up and sort exception table */ - mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Finally, copy percpu area over. */ - percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, - sechdrs[pcpuindex].sh_size); - - add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, - symoffs, stroffs, secstrings, strmap); - kfree(strmap); - strmap = NULL; - - if (!mod->taints) - debug = section_objs(hdr, sechdrs, secstrings, "__verbose", - sizeof(*debug), &num_debug); - - err = module_finalize(hdr, sechdrs, mod); - if (err < 0) - goto cleanup; +static void flush_module_icache(const struct module *mod) +{ + mm_segment_t old_fs; /* flush the icache in correct context */ old_fs = get_fs(); @@ -2511,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod, (unsigned long)mod->module_core + mod->core_size); set_fs(old_fs); +} - mod->args = args; - if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); +static struct module *layout_and_allocate(struct load_info *info) +{ + /* Module within temporary copy. */ + struct module *mod; + Elf_Shdr *pcpusec; + int err; + + mod = setup_load_info(info); + if (IS_ERR(mod)) + return mod; + + err = check_modinfo(mod, info); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, mod); + if (err < 0) + goto out; + + pcpusec = &info->sechdrs[info->index.pcpu]; + if (pcpusec->sh_size) { + /* We have a special allocation for this section. */ + err = percpu_modalloc(mod, + pcpusec->sh_size, pcpusec->sh_addralign); + if (err) + goto out; + pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, info); + + info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) + * sizeof(long), GFP_KERNEL); + if (!info->strmap) { + err = -ENOMEM; + goto free_percpu; + } + layout_symtab(mod, info); + + /* Allocate and move to the final place */ + err = move_module(mod, info); + if (err) + goto free_strmap; + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; + +free_strmap: + kfree(info->strmap); +free_percpu: + percpu_modfree(mod); +out: + return ERR_PTR(err); +} + +/* mod is no longer valid after this! */ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + kfree(info->strmap); + percpu_modfree(mod); + module_free(mod, mod->module_init); + module_free(mod, mod->module_core); +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. */ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct load_info info = { NULL, }; + struct module *mod; + long err; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + /* Copy in the blobs from userspace, check they are vaguely sane. */ + err = copy_and_check(&info, umod, len, uargs); + if (err) + return ERR_PTR(err); + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(&info); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto free_module; + + /* Now we've got everything in the final locations, we can + * find optional sections. */ + find_module_sections(mod, &info); + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, &info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, &info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, &info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, &info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + /* Mark state as coming so strong_try_module_get() ignores us. */ + mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since @@ -2530,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } - if (debug) - dynamic_debug_setup(debug, num_debug); + /* This has to be done once we're sure module name is unique. */ + if (!mod->taints) + dynamic_debug_setup(info.debug, info.num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2541,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod, list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); + /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); + /* Link in to syfs. */ + err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); if (err < 0) goto unlink; - add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - - /* Get rid of temporary copy */ - vfree(hdr); - - trace_module_load(mod); + /* Get rid of temporary copy and strmap. */ + kfree(info.strmap); + free_copy(&info); /* Done! */ + trace_module_load(mod); return mod; unlink: @@ -2565,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); ddebug: - dynamic_debug_remove(debug); + if (!mod->taints) + dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); + kfree(mod->args); + free_arch_cleanup: module_arch_cleanup(mod); - cleanup: + free_modinfo: free_modinfo(mod); + free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) - free_percpu(mod->refptr); - free_init: -#endif - module_free(mod, mod->module_init); - free_core: - module_free(mod, mod->module_core); - /* mod will be freed with core. Don't access it beyond this line! */ - free_percpu: - free_percpu(percpu); - free_mod: - kfree(args); - kfree(strmap); - free_hdr: - vfree(hdr); + free_module: + module_deallocate(mod, &info); + free_copy: + free_copy(&info); return ERR_PTR(err); - - truncated: - printk(KERN_ERR "Module len %lu truncated\n", len); - err = -ENOEXEC; - goto free_hdr; } /* Call module constructors. */ diff --git a/kernel/padata.c b/kernel/padata.c index fdd8ae609ce3..751019415d23 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -26,18 +26,19 @@ #include <linux/mutex.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/rcupdate.h> -#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; - target_cpu = cpumask_first(pd->cpumask); + target_cpu = cpumask_first(pd->cpumask.pcpu); for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask); + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); return target_cpu; } @@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata) * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *work) +static void padata_parallel_worker(struct work_struct *parallel_work) { - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, pwork); - pd = queue->pd; + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; pinst = pd->pinst; - spin_lock(&queue->parallel.lock); - list_replace_init(&queue->parallel.list, &local_list); - spin_unlock(&queue->parallel.lock); + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work) * @pinst: padata instance * @padata: object to be parallelized * @cb_cpu: cpu the serialization callback function will run on, - * must be in the cpumask of padata. + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel @@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu) { int target_cpu, err; - struct padata_queue *queue; + struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); pd = rcu_dereference(pinst->pd); - err = 0; - if (!(pinst->flags & PADATA_INIT)) + err = -EINVAL; + if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) + goto out; + + if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) goto out; err = -EBUSY; @@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst, if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) goto out; - err = -EINVAL; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) - goto out; - - err = -EINPROGRESS; + err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = cb_cpu; @@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst, padata->seq_nr = atomic_inc_return(&pd->seq_nr); target_cpu = padata_cpu_hash(padata); - queue = per_cpu_ptr(pd->queue, target_cpu); + queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->pwork); + queue_work_on(target_cpu, pinst->wq, &queue->work); out: rcu_read_unlock_bh(); @@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus, empty, calc_seq_nr; - int seq_nr, next_nr, overrun, next_overrun; - struct padata_queue *queue, *next_queue; + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; - empty = 0; - next_nr = -1; - next_overrun = 0; - next_queue = NULL; - - num_cpus = cpumask_weight(pd->cpumask); - - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - reorder = &queue->reorder; - - /* - * Calculate the seq_nr of the object that should be - * next in this reorder queue. - */ - overrun = 0; - calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) - + queue->cpu_index; + num_cpus = cpumask_weight(pd->cpumask.pcpu); - if (unlikely(calc_seq_nr > pd->max_seq_nr)) { - calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; - overrun = 1; - } - - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - seq_nr = padata->seq_nr; - BUG_ON(calc_seq_nr != seq_nr); - } else { - seq_nr = calc_seq_nr; - empty++; - } - - if (next_nr < 0 || seq_nr < next_nr - || (next_overrun && !overrun)) { - next_nr = seq_nr; - next_overrun = overrun; - next_queue = queue; - } + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; } padata = NULL; - if (empty == num_cpus) - goto out; - reorder = &next_queue->reorder; if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - if (unlikely(next_overrun)) { - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - atomic_set(&queue->num_obj, 0); - } - } + BUG_ON(next_nr != padata->seq_nr); spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); spin_unlock(&reorder->lock); - atomic_inc(&next_queue->num_obj); + pd->processed++; goto out; } - queue = per_cpu_ptr(pd->queue, smp_processor_id()); + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); if (queue->cpu_index == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; @@ -262,7 +231,7 @@ out: static void padata_reorder(struct parallel_data *pd) { struct padata_priv *padata; - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; /* @@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd) return; } - queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); - spin_lock(&queue->serial.lock); - list_add_tail(&padata->list, &queue->serial.list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg) padata_reorder(pd); } -static void padata_serial_worker(struct work_struct *work) +static void padata_serial_worker(struct work_struct *serial_work) { - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, swork); - pd = queue->pd; + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; - spin_lock(&queue->serial.lock); - list_replace_init(&queue->serial.list, &local_list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); + spin_unlock(&squeue->serial.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work) void padata_do_serial(struct padata_priv *padata) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; pd = padata->pd; cpu = get_cpu(); - queue = per_cpu_ptr(pd->queue, cpu); + pqueue = per_cpu_ptr(pd->pqueue, cpu); - spin_lock(&queue->reorder.lock); + spin_lock(&pqueue->reorder.lock); atomic_inc(&pd->reorder_objects); - list_add_tail(&padata->list, &queue->reorder.list); - spin_unlock(&queue->reorder.lock); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); put_cpu(); @@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -/* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *cpumask) +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { - int cpu, cpu_index, num_cpus; - struct padata_queue *queue; - struct parallel_data *pd; - - cpu_index = 0; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); - if (!pd) - goto err; + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.cbcpu); + return -ENOMEM; + } - pd->queue = alloc_percpu(struct padata_queue); - if (!pd->queue) - goto err_free_pd; + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} - if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) - goto err_free_queue; +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} - cpumask_and(pd->cpumask, cpumask, cpu_active_mask); +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} - queue->pd = pd; +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; - queue->cpu_index = cpu_index; + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; cpu_index++; - INIT_LIST_HEAD(&queue->reorder.list); - INIT_LIST_HEAD(&queue->parallel.list); - INIT_LIST_HEAD(&queue->serial.list); - spin_lock_init(&queue->reorder.lock); - spin_lock_init(&queue->parallel.lock); - spin_lock_init(&queue->serial.lock); - - INIT_WORK(&queue->pwork, padata_parallel_worker); - INIT_WORK(&queue->swork, padata_serial_worker); - atomic_set(&queue->num_obj, 0); + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); } - num_cpus = cpumask_weight(pd->cpumask); - pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + num_cpus = cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; +} + +/* Allocate and initialize the internal cpumask dependend resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; + + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + + padata_init_pqueues(pd); + padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); @@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, return pd; -err_free_queue: - free_percpu(pd->queue); +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); err_free_pd: kfree(pd); err: @@ -456,8 +464,10 @@ err: static void padata_free_pd(struct parallel_data *pd) { - free_cpumask_var(pd->cpumask); - free_percpu(pd->queue); + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); kfree(pd); } @@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd) static void padata_flush_queues(struct parallel_data *pd) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->pwork); + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); } del_timer_sync(&pd->timer); @@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd) if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->swork); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); } BUG_ON(atomic_read(&pd->refcnt) != 0); } +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + /* Replace the internal control stucture with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; pinst->flags |= PADATA_RESET; @@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst, synchronize_rcu(); + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + padata_flush_queues(pd_old); padata_free_pd(pd_old); + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + pinst->flags &= ~PADATA_RESET; } /** - * padata_set_cpumask - set the cpumask that padata should use + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. * - * @pinst: padata instance - * @cpumask: the cpumask to use + * @pinst: A poineter to padata instance + * @nblock: A pointer to notifier block. */ -int padata_set_cpumask(struct padata_instance *pinst, - cpumask_var_t cpumask) +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) { + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); + +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nlock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); + + +/* If cpumask contains no active cpu, we mark the instance as invalid. */ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; struct parallel_data *pd; - int err = 0; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; + } + + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + padata_replace(pinst, pd); + + if (valid) + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the wokers doing serialization. + * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumsak to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; mutex_lock(&pinst->lock); + get_online_cpus(); + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to parallel and serial cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; + + mutex_lock(&pinst->lock); get_online_cpus(); - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) { - err = -ENOMEM; - goto out; + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; + serial_mask = cpumask; + break; + default: + goto out; } - cpumask_copy(pinst->cpumask, cpumask); - - padata_replace(pinst, pd); + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: put_online_cpus(); - mutex_unlock(&pinst->lock); return err; @@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) struct parallel_data *pd; if (cpumask_test_cpu(cpu, cpu_active_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); } return 0; } -/** - * padata_add_cpu - add a cpu to the padata cpumask + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_add_cpu(struct padata_instance *pinst, int cpu) + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_set_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_add_cpu(pinst, cpu); put_online_cpus(); @@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu); static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + struct parallel_data *pd = NULL; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; @@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } -/** - * padata_remove_cpu - remove a cpu from the padata cpumask + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu) +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_clear_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_remove_cpu(pinst, cpu); put_online_cpus(); @@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu); * * @pinst: padata instance to start */ -void padata_start(struct padata_instance *pinst) +int padata_start(struct padata_instance *pinst) { + int err = 0; + mutex_lock(&pinst->lock); - pinst->flags |= PADATA_INIT; + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); + mutex_unlock(&pinst->lock); + + return err; } EXPORT_SYMBOL(padata_start); @@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start); void padata_stop(struct padata_instance *pinst) { mutex_lock(&pinst->lock); - pinst->flags &= ~PADATA_INIT; + __padata_stop(pinst); mutex_unlock(&pinst->lock); } EXPORT_SYMBOL(padata_stop); #ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); +} + + static int padata_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_add_cpu(pinst, cpu); @@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_remove_cpu(pinst, cpu); @@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_remove_cpu(pinst, cpu); @@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_add_cpu(pinst, cpu); @@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb, } #endif +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? + PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); + +/* + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + ¶llel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->store(pinst, attr, buf, count); + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + /** - * padata_alloc - allocate and initialize a padata instance + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. * - * @cpumask: cpumask that padata uses for parallelization * @wq: workqueue to use for the allocated padata instance */ -struct padata_instance *padata_alloc(const struct cpumask *cpumask, - struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; - struct parallel_data *pd; + struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) goto err; get_online_cpus(); - - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); + goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; - if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) - goto err_free_pd; + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; - cpumask_copy(pinst->cpumask, cpumask); + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); pinst->flags = 0; @@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, put_online_cpus(); + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); return pinst; -err_free_pd: - padata_free_pd(pd); +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); put_online_cpus(); @@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc); */ void padata_free(struct padata_instance *pinst) { - padata_stop(pinst); - - synchronize_rcu(); - -#ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); -#endif - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); - - padata_free_pd(pinst->pd); - free_cpumask_var(pinst->cpumask); - kfree(pinst); + kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c index 3b16cd93fa7d..4c13b1a88ebb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,9 @@ #include <linux/nmi.h> #include <linux/dmi.h> +#define PANIC_TIMER_STEP 100 +#define PANIC_BLINK_SPD 18 + int panic_on_oops; static unsigned long tainted_mask; static int pause_on_oops; @@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -/* Returns how long it waited in ms */ -long (*panic_blink)(long time); -EXPORT_SYMBOL(panic_blink); - -static void panic_blink_one_second(void) +static long no_blink(int state) { - static long i = 0, end; - - if (panic_blink) { - end = i + MSEC_PER_SEC; - - while (i < end) { - i += panic_blink(i); - mdelay(1); - i++; - } - } else { - /* - * When running under a hypervisor a small mdelay may get - * rounded up to the hypervisor timeslice. For example, with - * a 1ms in 10ms hypervisor timeslice we might inflate a - * mdelay(1) loop by 10x. - * - * If we have nothing to blink, spin on 1 second calls to - * mdelay to avoid this. - */ - mdelay(MSEC_PER_SEC); - } + return 0; } +/* Returns how long it waited in ms */ +long (*panic_blink)(int state); +EXPORT_SYMBOL(panic_blink); + /** * panic - halt the system * @fmt: The text string to print @@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...) { static char buf[1024]; va_list args; - long i; + long i, i_next = 0; + int state = 0; /* * It's possible to come here directly from a panic-assertion and @@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...) bust_spinlocks(0); + if (!panic_blink) + panic_blink = no_blink; + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. @@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...) */ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); - for (i = 0; i < panic_timeout; i++) { + for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); - panic_blink_one_second(); + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); } /* * This will not be a clean reboot, with everything @@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif local_irq_enable(); - while (1) { + for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); - panic_blink_one_second(); + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); } } @@ -344,7 +338,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -static void print_oops_end_marker(void) +void print_oops_end_marker(void) { init_oops_id(); printk(KERN_WARNING "---[ end trace %016llx ]---\n", diff --git a/kernel/params.c b/kernel/params.c index 0b30ecd53a52..08107d181758 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -31,6 +31,42 @@ #define DEBUGP(fmt, a...) #endif +/* Protects all parameters, and incidentally kmalloced_param list. */ +static DEFINE_MUTEX(param_lock); + +/* This just allows us to keep track of which parameters are kmalloced. */ +struct kmalloced_param { + struct list_head list; + char val[]; +}; +static LIST_HEAD(kmalloced_params); + +static void *kmalloc_parameter(unsigned int size) +{ + struct kmalloced_param *p; + + p = kmalloc(sizeof(*p) + size, GFP_KERNEL); + if (!p) + return NULL; + + list_add(&p->list, &kmalloced_params); + return p->val; +} + +/* Does nothing if parameter wasn't kmalloced above. */ +static void maybe_kfree_parameter(void *param) +{ + struct kmalloced_param *p; + + list_for_each_entry(p, &kmalloced_params, list) { + if (p->val == param) { + list_del(&p->list); + kfree(p); + break; + } + } +} + static inline char dash2underscore(char c) { if (c == '-') @@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname) static int parse_one(char *param, char *val, - struct kernel_param *params, + const struct kernel_param *params, unsigned num_params, int (*handle_unknown)(char *param, char *val)) { unsigned int i; + int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { + /* Noone handled NULL, so do it here. */ + if (!val && params[i].ops->set != param_set_bool) + return -EINVAL; DEBUGP("They are equal! Calling %p\n", - params[i].set); - return params[i].set(val, ¶ms[i]); + params[i].ops->set); + mutex_lock(¶m_lock); + err = params[i].ops->set(val, ¶ms[i]); + mutex_unlock(¶m_lock); + return err; } } @@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val) /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ int parse_args(const char *name, char *args, - struct kernel_param *params, + const struct kernel_param *params, unsigned num, int (*unknown)(char *param, char *val)) { @@ -176,22 +219,29 @@ int parse_args(const char *name, /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ - int param_set_##name(const char *val, struct kernel_param *kp) \ + int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ tmptype l; \ int ret; \ \ - if (!val) return -EINVAL; \ ret = strtolfn(val, 0, &l); \ if (ret == -EINVAL || ((type)l != l)) \ return -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ } \ - int param_get_##name(char *buffer, struct kernel_param *kp) \ + int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return sprintf(buffer, format, *((type *)kp->arg)); \ - } + } \ + struct kernel_param_ops param_ops_##name = { \ + .set = param_set_##name, \ + .get = param_get_##name, \ + }; \ + EXPORT_SYMBOL(param_set_##name); \ + EXPORT_SYMBOL(param_get_##name); \ + EXPORT_SYMBOL(param_ops_##name) + STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); @@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); -int param_set_charp(const char *val, struct kernel_param *kp) +int param_set_charp(const char *val, const struct kernel_param *kp) { - if (!val) { - printk(KERN_ERR "%s: string parameter expected\n", - kp->name); - return -EINVAL; - } - if (strlen(val) > 1024) { printk(KERN_ERR "%s: string parameter too long\n", kp->name); return -ENOSPC; } - /* This is a hack. We can't need to strdup in early boot, and we + maybe_kfree_parameter(*(char **)kp->arg); + + /* This is a hack. We can't kmalloc in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - *(char **)kp->arg = kstrdup(val, GFP_KERNEL); + *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); if (!*(char **)kp->arg) return -ENOMEM; + strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } +EXPORT_SYMBOL(param_set_charp); -int param_get_charp(char *buffer, struct kernel_param *kp) +int param_get_charp(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%s", *((char **)kp->arg)); } +EXPORT_SYMBOL(param_get_charp); + +static void param_free_charp(void *arg) +{ + maybe_kfree_parameter(*((char **)arg)); +} + +struct kernel_param_ops param_ops_charp = { + .set = param_set_charp, + .get = param_get_charp, + .free = param_free_charp, +}; +EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ -int param_set_bool(const char *val, struct kernel_param *kp) +int param_set_bool(const char *val, const struct kernel_param *kp) { bool v; @@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp) *(int *)kp->arg = v; return 0; } +EXPORT_SYMBOL(param_set_bool); -int param_get_bool(char *buffer, struct kernel_param *kp) +int param_get_bool(char *buffer, const struct kernel_param *kp) { bool val; if (kp->flags & KPARAM_ISBOOL) @@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp) /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c", val ? 'Y' : 'N'); } +EXPORT_SYMBOL(param_get_bool); + +struct kernel_param_ops param_ops_bool = { + .set = param_set_bool, + .get = param_get_bool, +}; +EXPORT_SYMBOL(param_ops_bool); /* This one must be bool. */ -int param_set_invbool(const char *val, struct kernel_param *kp) +int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; @@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp) *(bool *)kp->arg = !boolval; return ret; } +EXPORT_SYMBOL(param_set_invbool); -int param_get_invbool(char *buffer, struct kernel_param *kp) +int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); } +EXPORT_SYMBOL(param_get_invbool); + +struct kernel_param_ops param_ops_invbool = { + .set = param_set_invbool, + .get = param_get_invbool, +}; +EXPORT_SYMBOL(param_ops_invbool); /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, - int (*set)(const char *, struct kernel_param *kp), + int (*set)(const char *, const struct kernel_param *kp), u16 flags, unsigned int *num) { @@ -309,12 +386,6 @@ static int param_array(const char *name, kp.arg = elem; kp.flags = flags; - /* No equals sign? */ - if (!val) { - printk(KERN_ERR "%s: expects arguments\n", name); - return -EINVAL; - } - *num = 0; /* We expect a comma-separated list of values. */ do { @@ -330,6 +401,7 @@ static int param_array(const char *name, /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; + BUG_ON(!mutex_is_locked(¶m_lock)); ret = set(val, &kp); if (ret != 0) @@ -347,17 +419,17 @@ static int param_array(const char *name, return 0; } -int param_array_set(const char *val, struct kernel_param *kp) +static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, kp->flags, + arr->elemsize, arr->ops->set, kp->flags, arr->num ?: &temp_num); } -int param_array_get(char *buffer, struct kernel_param *kp) +static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; @@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp) if (i) buffer[off++] = ','; p.arg = arr->elem + arr->elemsize * i; - ret = arr->get(buffer + off, &p); + BUG_ON(!mutex_is_locked(¶m_lock)); + ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; @@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp) return off; } -int param_set_copystring(const char *val, struct kernel_param *kp) +static void param_array_free(void *arg) +{ + unsigned int i; + const struct kparam_array *arr = arg; + + if (arr->ops->free) + for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) + arr->ops->free(arr->elem + arr->elemsize * i); +} + +struct kernel_param_ops param_array_ops = { + .set = param_array_set, + .get = param_array_get, + .free = param_array_free, +}; +EXPORT_SYMBOL(param_array_ops); + +int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - if (!val) { - printk(KERN_ERR "%s: missing param set value\n", kp->name); - return -EINVAL; - } if (strlen(val)+1 > kps->maxlen) { printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); @@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp) strcpy(kps->string, val); return 0; } +EXPORT_SYMBOL(param_set_copystring); -int param_get_string(char *buffer, struct kernel_param *kp) +int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return strlcpy(buffer, kps->string, kps->maxlen); } +EXPORT_SYMBOL(param_get_string); + +struct kernel_param_ops param_ops_string = { + .set = param_set_copystring, + .get = param_get_string, +}; +EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ #define to_module_attr(n) container_of(n, struct module_attribute, attr) @@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[]; struct param_attribute { struct module_attribute mattr; - struct kernel_param *param; + const struct kernel_param *param; }; struct module_param_attrs @@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr, int count; struct param_attribute *attribute = to_param_attr(mattr); - if (!attribute->param->get) + if (!attribute->param->ops->get) return -EPERM; - count = attribute->param->get(buf, attribute->param); + mutex_lock(¶m_lock); + count = attribute->param->ops->get(buf, attribute->param); + mutex_unlock(¶m_lock); if (count > 0) { strcat(buf, "\n"); ++count; @@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr, int err; struct param_attribute *attribute = to_param_attr(mattr); - if (!attribute->param->set) + if (!attribute->param->ops->set) return -EPERM; - err = attribute->param->set(buf, attribute->param); + mutex_lock(¶m_lock); + err = attribute->param->ops->set(buf, attribute->param); + mutex_unlock(¶m_lock); if (!err) return len; return err; @@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #endif #ifdef CONFIG_SYSFS +void __kernel_param_lock(void) +{ + mutex_lock(¶m_lock); +} +EXPORT_SYMBOL(__kernel_param_lock); + +void __kernel_param_unlock(void) +{ + mutex_unlock(¶m_lock); +} +EXPORT_SYMBOL(__kernel_param_unlock); + /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject @@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, * if there's an error. */ static __modinit int add_sysfs_param(struct module_kobject *mk, - struct kernel_param *kp, + const struct kernel_param *kp, const char *name) { struct module_param_attrs *new; @@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk) * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, - struct kernel_param *kparam, + const struct kernel_param *kparam, unsigned int num_params) { int i, err; @@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - /* FIXME: This should free kmalloced charp parameters. It doesn't. */ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); } static void __init kernel_add_sysfs_param(const char *name, @@ -768,28 +882,3 @@ static int __init param_sysfs_init(void) subsys_initcall(param_sysfs_init); #endif /* CONFIG_SYSFS */ - -EXPORT_SYMBOL(param_set_byte); -EXPORT_SYMBOL(param_get_byte); -EXPORT_SYMBOL(param_set_short); -EXPORT_SYMBOL(param_get_short); -EXPORT_SYMBOL(param_set_ushort); -EXPORT_SYMBOL(param_get_ushort); -EXPORT_SYMBOL(param_set_int); -EXPORT_SYMBOL(param_get_int); -EXPORT_SYMBOL(param_set_uint); -EXPORT_SYMBOL(param_get_uint); -EXPORT_SYMBOL(param_set_long); -EXPORT_SYMBOL(param_get_long); -EXPORT_SYMBOL(param_set_ulong); -EXPORT_SYMBOL(param_get_ulong); -EXPORT_SYMBOL(param_set_charp); -EXPORT_SYMBOL(param_get_charp); -EXPORT_SYMBOL(param_set_bool); -EXPORT_SYMBOL(param_get_bool); -EXPORT_SYMBOL(param_set_invbool); -EXPORT_SYMBOL(param_get_invbool); -EXPORT_SYMBOL(param_array_set); -EXPORT_SYMBOL(param_array_get); -EXPORT_SYMBOL(param_set_copystring); -EXPORT_SYMBOL(param_get_string); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f416aef242c3..0d38f27ad885 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* diff --git a/kernel/pid.c b/kernel/pid.c index e9fd8c132d26..d55c6fb8d087 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid) atomic_inc(&map->nr_free); } +/* + * If we started walking pids at 'base', is 'a' seen before 'b'? + */ +static int pid_before(int base, int a, int b) +{ + /* + * This is the same as saying + * + * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT + * and that mapping orders 'a' and 'b' with respect to 'base'. + */ + return (unsigned)(a - base) < (unsigned)(b - base); +} + +/* + * We might be racing with someone else trying to set pid_ns->last_pid. + * We want the winner to have the "later" value, because if the + * "earlier" value prevails, then a pid may get reused immediately. + * + * Since pids rollover, it is not sufficient to just pick the bigger + * value. We have to consider where we started counting from. + * + * 'base' is the value of pid_ns->last_pid that we observed when + * we started looking for a pid. + * + * 'pid' is the pid that we eventually found. + */ +static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) +{ + int prev; + int last_write = base; + do { + prev = last_write; + last_write = cmpxchg(&pid_ns->last_pid, prev, pid); + } while ((prev != last_write) && (pid_before(base, last_write, pid))); +} + static int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; @@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; + /* + * If last_pid points into the middle of the map->page we + * want to scan this bitmap block twice, the second time + * we start with offset == 0 (or RESERVED_PIDS). + */ + max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); @@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pid_ns->last_pid = pid; + set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); pid = mk_pid(pid_ns, map, offset); - /* - * find_next_offset() found a bit, the pid from it - * is in-bounds, and if we fell back to the last - * bitmap block and the final block was the same - * as the starting point, pid is before last_pid. - */ - } while (offset < BITS_PER_PAGE && pid < pid_max && - (i != max_scan || pid < last || - !((last+1) & BITS_PER_PAGE_MASK))); + } while (offset < BITS_PER_PAGE && pid < pid_max); } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index f42d3f737a33..996a4dec5f96 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -48,59 +48,49 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_request_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - int pm_qos_class; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct pm_qos_request_list requests; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; -static void update_target(int pm_qos_class) + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct pm_qos_request_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[pm_qos_class]->default_value; - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requests.list, list) { - extreme_value = pm_qos_array[pm_qos_class]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != - extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[pm_qos_class]->target_value, - extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, - atomic_read(&pm_qos_array[pm_qos_class]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain( - pm_qos_array[pm_qos_class]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + /** * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us @@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request); * element as a handle for use in updating and removal. Call needs to save * this handle for later use. */ -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct pm_qos_request_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->pm_qos_class = pm_qos_class; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requests.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - - return dep; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * Attempts are made to make this code callable on hot code paths. */ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, - s32 new_value) + s32 new_value) { - unsigned long flags; - int pending_update = 0; s32 temp; + struct pm_qos_object *o; + + if (!pm_qos_req) /*guard against callers passing in null */ + return; - if (pm_qos_req) { /*guard against callers passing in null */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->value) { - pending_update = 1; - pm_qos_req->value = temp; - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_req->pm_qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } EXPORT_SYMBOL_GPL(pm_qos_update_request); @@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request); */ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - int qos_class; + struct pm_qos_object *o; if (pm_qos_req == NULL) return; /* silent return to keep pcm code cleaner */ - qos_class = pm_qos_req->pm_qos_class; - spin_lock_irqsave(&pm_qos_lock, flags); - list_del(&pm_qos_req->list); - kfree(pm_qos_req); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *) pm_qos_add_request(pm_qos_class, - PM_QOS_DEFAULT_VALUE); + struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; if (filp->private_data) return 0; @@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) { struct pm_qos_request_list *req; - req = (struct pm_qos_request_list *)filp->private_data; + req = filp->private_data; pm_qos_remove_request(req); + kfree(req); return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399c..6842eeba5879 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -16,13 +16,13 @@ * siglock protection since other code may update expiration cache as * well. */ -void update_rlimit_cpu(unsigned long rlim_new) +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); } static int check_clock(const clockid_t which_clock) @@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - struct sighand_struct *sighand; - struct signal_struct *sig; + struct signal_struct *sig = tsk->signal; struct task_struct *t; - *times = INIT_CPUTIME; + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (!sighand) + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) goto out; - sig = tsk->signal; - t = tsk; do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += t->se.sum_exec_runtime; - - t = next_thread(t); - } while (t != tsk); - - times->utime = cputime_add(times->utime, sig->utime); - times->stime = cputime_add(times->stime, sig->stime); - times->sum_exec_runtime += sig->sum_sched_runtime; + } while_each_thread(tsk, t); out: rcu_read_unlock(); } @@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - /* tsk == current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { .utime = tsk->utime, @@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - thread_group_cputimer(tsk, &group_sample); + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (!fastpath_timer_check(tsk)) return; - spin_lock(&tsk->sighand->siglock); + if (!lock_task_sighand(tsk, &flags)) + return; /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc3..9ca4973f736d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } if (timer_event_spec) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) { error = -EFAULT; @@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); if (error) goto out; diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 97024fd40cd5..83bbc7c02df9 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa9e916da4d5..c77963938bca 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. @@ -277,7 +277,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -288,8 +288,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode) error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -336,6 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + hibernation_freeze_swap(); saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) @@ -511,18 +514,24 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b21fc0..62b0bc6e4983 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (sscanf(buf, "%lu", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -236,6 +290,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, + &wakeup_count_attr.attr, #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab6..028a99598f49 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> +#include <linux/workqueue.h> /* * Timeout for stopping processes @@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; @@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + while (true) { todo = 0; read_lock(&tasklist_lock); @@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + if (!todo || time_after(jiffies, end_time)) break; @@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only) */ printk("\n"); printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); + "(%d tasks refusing to freeze, wq_busy=%d):\n", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); @@ -157,6 +173,7 @@ void thaw_processes(void) oom_killer_enable(); printk("Restarting tasks ... "); + thaw_workqueues(); thaw_tasks(true); thaw_tasks(false); schedule(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 25ce010e9f8b..5e7edfb05e66 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -1086,6 +1086,7 @@ void swsusp_free(void) buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernation_thaw_swap(); } /* Helper functions used for the shrinking of memory. */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f37cb7dd4402..7335952ee473 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b0bb21778391..5d0059eed3e4 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,7 +4,7 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -32,7 +32,7 @@ /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(get_swap_for_hibernation(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_for_hibernation(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) ext = container_of(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_for_hibernation(swp_entry(swap, offset)); kfree(ext); } diff --git a/kernel/printk.c b/kernel/printk.c index 444b770c9595..8fe465ac008a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -37,6 +37,8 @@ #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/uaccess.h> @@ -985,6 +987,32 @@ void resume_console(void) } /** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused + * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. + */ +static int __cpuinit console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DYING: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + acquire_console_sem(); + release_console_sem(); + } + return NOTIFY_OK; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -1371,7 +1399,7 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); -static int __init disable_boot_consoles(void) +static int __init printk_late_init(void) { struct console *con; @@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void) unregister_console(con); } } + hotcpu_notifier(console_cpu_notify, 0); return 0; } -late_initcall(disable_boot_consoles); +late_initcall(printk_late_init); #if defined CONFIG_PRINTK @@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) chars = logged_chars; spin_unlock_irqrestore(&logbuf_lock, flags); - if (logged_chars > end) { - s1 = log_buf + log_buf_len - logged_chars + end; - l1 = logged_chars - end; + if (chars > end) { + s1 = log_buf + log_buf_len - chars + end; + l1 = chars - end; s2 = log_buf; l2 = end; @@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason) s1 = ""; l1 = 0; - s2 = log_buf + end - logged_chars; - l2 = logged_chars; + s2 = log_buf + end - chars; + l2 = chars; } if (!spin_trylock_irqsave(&dump_list_lock, flags)) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 74a3d693c196..f34d798ef4a2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -324,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data) } /* - * Detach all tasks we were using ptrace on. + * Detach all tasks we were using ptrace on. Called with tasklist held + * for writing, and returns with it held too. But note it can release + * and reacquire the lock. */ void exit_ptrace(struct task_struct *tracer) { struct task_struct *p, *n; LIST_HEAD(ptrace_dead); - write_lock_irq(&tasklist_lock); + if (likely(list_empty(&tracer->ptraced))) + return; + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } - write_unlock_irq(&tasklist_lock); + write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&tracer->ptraced)); list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } + + write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) diff --git a/kernel/range.c b/kernel/range.c index 74e2e6114927..471b66acabb5 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -7,10 +7,6 @@ #include <linux/range.h> -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) { if (start >= end) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f5..4d169835fb36 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) } EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); #endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd236..196ec02f8be0 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a5..2e2726d790b9 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -239,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706f..d5bc43976c5a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/sched.c b/kernel/sched.c index 265cf3a2b5d8..41541d79e3c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include <asm/irq_regs.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -456,9 +457,10 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU. In case of a completely @@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -int nohz_ratelimit(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 diff = rq->clock - rq->nohz_stamp; - - rq->nohz_stamp = rq->clock; - - return diff < (NSEC_PER_SEC / HZ) >> 1; -} - #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { @@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample) } #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.statistics.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, en_flags); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - out_running: - trace_sched_wakeup(p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2399,6 +2431,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. */ static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { unsigned long old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq) * example. */ if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); calc_load_account_active(this_rq); } @@ -3426,7 +3574,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); @@ -3598,7 +3746,6 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3611,11 +3758,26 @@ need_resched_nonpreemptible: raw_spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -3637,8 +3799,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3647,11 +3811,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -4441,12 +4602,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if (cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6096,7 +6277,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7356,10 +7543,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -7604,6 +7789,9 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7617,6 +7805,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7661,8 +7853,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb3..52f1a149bfb1 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar <mingo@redhat.com> * Guillaume Chazarain <guichaz@gmail.com> * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc83..2722dc1b4138 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp) { - gfp_t gfp = GFP_KERNEL; int i; - if (bootmem) - gfp = GFP_NOWAIT; - memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) raw_spin_lock_init(&vec->lock); vec->count = 0; - if (!zalloc_cpumask_var(&vec->mask, gfp)) + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95be..9fc7d386fea4 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -27,7 +27,7 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00d..2e1b0d17dd9b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); + P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332daa..806d1b227a21 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 > group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu) { - *balance = 0; - return; + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } - update_group_power(sd, this_cpu); - /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; @@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; } /** @@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; + struct sched_group *sg = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; @@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, do { int local_group; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; + sds->total_pwr += sg->cpu_power; /* * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try + * first, lower the sg capacity to one so that we'll try * and move all the excess tasks away. */ if (prefer_sibling) @@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, if (local_group) { sds->this_load = sgs.avg_load; - sds->this = group; + sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; - sds->busiest = group; + sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; } /** @@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!(*balance)) goto ret; + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; @@ -2726,8 +2850,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; + if (!capacity) + capacity = fix_small_capacity(sd, group); + if (!cpumask_test_cpu(i, cpus)) continue; @@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + /* * The only task running in a non-idle cpu can be moved to this * cpu in an attempt to completely freeup the other CPU @@ -2854,7 +2992,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, cpus); + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2898,7 +3036,8 @@ redo: schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle)) { + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3093,13 +3232,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif /* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. + * load balancing on behalf of all those cpus. * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3385,11 +3569,102 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (!rq->nr_running) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c6..d10c80ebb67a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; - if (!p->signal) - return; - /* max may change after cur was read, this will be fixed next tick */ soft = task_rlimit(p, RLIMIT_RTTIME); hard = task_rlimit_max(p, RLIMIT_RTTIME); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b0..25c2f962f6fc 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer; - struct signal_struct *sig; - - sig = tsk->signal; - /* see __exit_signal()->task_rq_unlock_wait() */ - barrier(); - if (unlikely(!sig)) - return; - - cputimer = &sig->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; diff --git a/kernel/signal.c b/kernel/signal.c index 906ae5a1779c..bded65187780 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) @@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c43645298..000000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/seq_file.h> -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for debugfs - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4f..000000000000 --- a/kernel/slow-work.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - * - * See Documentation/slow-work.txt - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/wait.h> -#include <linux/debugfs.h> -#include "slow-work.h" - -static void slow_work_cull_timeout(unsigned long); -static void slow_work_oom_timeout(unsigned long); - -#ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -static int slow_work_max_threads_sysctl(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -#endif - -/* - * The pool of threads has at least min threads in it as long as someone is - * using the facility, and may have as many as max. - * - * A portion of the pool may be processing very slow operations. - */ -static unsigned slow_work_min_threads = 2; -static unsigned slow_work_max_threads = 4; -static unsigned vslow_work_proportion = 50; /* % of threads that may process - * very slow work */ - -#ifdef CONFIG_SYSCTL -static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; -static const int slow_work_min_vslow = 1; -static const int slow_work_max_vslow = 99; - -ctl_table slow_work_sysctls[] = { - { - .procname = "min-threads", - .data = &slow_work_min_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_min_threads_sysctl, - .extra1 = (void *) &slow_work_min_min_threads, - .extra2 = &slow_work_max_threads, - }, - { - .procname = "max-threads", - .data = &slow_work_max_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_max_threads_sysctl, - .extra1 = &slow_work_min_threads, - .extra2 = (void *) &slow_work_max_max_threads, - }, - { - .procname = "vslow-percentage", - .data = &vslow_work_proportion, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &slow_work_min_vslow, - .extra2 = (void *) &slow_work_max_vslow, - }, - {} -}; -#endif - -/* - * The active state of the thread pool - */ -static atomic_t slow_work_thread_count; -static atomic_t vslow_work_executing_count; - -static bool slow_work_may_not_start_new_thread; -static bool slow_work_cull; /* cull a thread due to lack of activity */ -static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); -static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); -static struct slow_work slow_work_new_thread; /* new thread starter */ - -/* - * slow work ID allocation (use slow_work_queue_lock) - */ -static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - -/* - * Unregistration tracking to prevent put_ref() from disappearing during module - * unload - */ -#ifdef CONFIG_MODULES -static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; -static struct module *slow_work_unreg_module; -static struct slow_work *slow_work_unreg_work_item; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); -static DEFINE_MUTEX(slow_work_unreg_sync_lock); - -static void slow_work_set_thread_processing(int id, struct slow_work *work) -{ - if (work) - slow_work_thread_processing[id] = work->owner; -} -static void slow_work_done_thread_processing(int id, struct slow_work *work) -{ - struct module *module = slow_work_thread_processing[id]; - - slow_work_thread_processing[id] = NULL; - smp_mb(); - if (slow_work_unreg_work_item == work || - slow_work_unreg_module == module) - wake_up_all(&slow_work_unreg_wq); -} -static void slow_work_clear_thread_processing(int id) -{ - slow_work_thread_processing[id] = NULL; -} -#else -static void slow_work_set_thread_processing(int id, struct slow_work *work) {} -static void slow_work_done_thread_processing(int id, struct slow_work *work) {} -static void slow_work_clear_thread_processing(int id) {} -#endif - -/* - * Data for tracking currently executing items for indication through /proc - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; -pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; -DEFINE_RWLOCK(slow_work_execs_lock); -#endif - -/* - * The queues of work items and the lock governing access to them. These are - * shared between all the CPUs. It doesn't make sense to have per-CPU queues - * as the number of threads bears no relation to the number of CPUs. - * - * There are two queues of work items: one for slow work items, and one for - * very slow work items. - */ -LIST_HEAD(slow_work_queue); -LIST_HEAD(vslow_work_queue); -DEFINE_SPINLOCK(slow_work_queue_lock); - -/* - * The following are two wait queues that get pinged when a work item is placed - * on an empty queue. These allow work items that are hogging a thread by - * sleeping in a way that could be deferred to yield their thread and enqueue - * themselves. - */ -static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); -static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); - -/* - * The thread controls. A variable used to signal to the threads that they - * should exit when the queue is empty, a waitqueue used by the threads to wait - * for signals, and a completion set by the last thread to exit. - */ -static bool slow_work_threads_should_exit; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); -static DECLARE_COMPLETION(slow_work_last_thread_exited); - -/* - * The number of users of the thread pool and its lock. Whilst this is zero we - * have no threads hanging around, and when this reaches zero, we wait for all - * active or queued work items to complete and kill all the threads we do have. - */ -static int slow_work_user_count; -static DEFINE_MUTEX(slow_work_user_lock); - -static inline int slow_work_get_ref(struct slow_work *work) -{ - if (work->ops->get_ref) - return work->ops->get_ref(work); - - return 0; -} - -static inline void slow_work_put_ref(struct slow_work *work) -{ - if (work->ops->put_ref) - work->ops->put_ref(work); -} - -/* - * Calculate the maximum number of active threads in the pool that are - * permitted to process very slow work items. - * - * The answer is rounded up to at least 1, but may not equal or exceed the - * maximum number of the threads in the pool. This means we always have at - * least one thread that can process slow work items, and we always have at - * least one thread that won't get tied up doing so. - */ -static unsigned slow_work_calc_vsmax(void) -{ - unsigned vsmax; - - vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; - vsmax /= 100; - vsmax = max(vsmax, 1U); - return min(vsmax, slow_work_max_threads - 1); -} - -/* - * Attempt to execute stuff queued on a slow thread. Return true if we managed - * it, false if there was nothing to do. - */ -static noinline bool slow_work_execute(int id) -{ - struct slow_work *work = NULL; - unsigned vsmax; - bool very_slow; - - vsmax = slow_work_calc_vsmax(); - - /* see if we can schedule a new thread to be started if we're not - * keeping up with the work */ - if (!waitqueue_active(&slow_work_thread_wq) && - (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && - atomic_read(&slow_work_thread_count) < slow_work_max_threads && - !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - - /* find something to execute */ - spin_lock_irq(&slow_work_queue_lock); - if (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax) { - work = list_entry(vslow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - atomic_inc(&vslow_work_executing_count); - very_slow = true; - } else if (!list_empty(&slow_work_queue)) { - work = list_entry(slow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - very_slow = false; - } else { - very_slow = false; /* avoid the compiler warning */ - } - - slow_work_set_thread_processing(id, work); - if (work) { - slow_work_mark_time(work); - slow_work_begin_exec(id, work); - } - - spin_unlock_irq(&slow_work_queue_lock); - - if (!work) - return false; - - if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) - BUG(); - - /* don't execute if the work is in the process of being cancelled */ - if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) - work->ops->execute(work); - - if (very_slow) - atomic_dec(&vslow_work_executing_count); - clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); - - /* wake up anyone waiting for this work to be complete */ - wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); - - slow_work_end_exec(id, work); - - /* if someone tried to enqueue the item whilst we were executing it, - * then it'll be left unenqueued to avoid multiple threads trying to - * execute it simultaneously - * - * there is, however, a race between us testing the pending flag and - * getting the spinlock, and between the enqueuer setting the pending - * flag and getting the spinlock, so we use a deferral bit to tell us - * if the enqueuer got there first - */ - if (test_bit(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irq(&slow_work_queue_lock); - - if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && - test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) - goto auto_requeue; - - spin_unlock_irq(&slow_work_queue_lock); - } - - /* sort out the race between module unloading and put_ref() */ - slow_work_put_ref(work); - slow_work_done_thread_processing(id, work); - - return true; - -auto_requeue: - /* we must complete the enqueue operation - * - we transfer our ref on the item back to the appropriate queue - * - don't wake another thread up as we're awake already - */ - slow_work_mark_time(work); - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); - spin_unlock_irq(&slow_work_queue_lock); - slow_work_clear_thread_processing(id); - return true; -} - -/** - * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work - * work: The work item under execution that wants to sleep - * _timeout: Scheduler sleep timeout - * - * Allow a requeueable work item to sleep on a slow-work processor thread until - * that thread is needed to do some other work or the sleep is interrupted by - * some other event. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * False is returned if there is nothing on the queue; true is returned if the - * work item should be requeued - */ -bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - - DEFINE_WAIT(wait); - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - if (!list_empty(queue)) - return true; - - add_wait_queue_exclusive(wfo_wq, &wait); - if (list_empty(queue)) - *_timeout = schedule_timeout(*_timeout); - finish_wait(wfo_wq, &wait); - - return !list_empty(queue); -} -EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); - -/** - * slow_work_enqueue - Schedule a slow work item for processing - * @work: The work item to queue - * - * Schedule a slow work item for processing. If the item is already undergoing - * execution, this guarantees not to re-enter the execution routine until the - * first execution finishes. - * - * The item is pinned by this function as it retains a reference to it, managed - * through the item operations. The item is unpinned once it has been - * executed. - * - * An item may hog the thread that is running it for a relatively large amount - * of time, sufficient, for example, to perform several lookup, mkdir, create - * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. - * - * Conversely, if a number of items are awaiting processing, it may take some - * time before any given item is given attention. The number of threads in the - * pool may be increased to deal with demand, but only up to a limit. - * - * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in - * the very slow queue, from which only a portion of the threads will be - * allowed to pick items to execute. This ensures that very slow items won't - * overly block ones that are just ordinarily slow. - * - * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is - * attempted queued) - */ -int slow_work_enqueue(struct slow_work *work) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - unsigned long flags; - int ret; - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - /* when honouring an enqueue request, we only promise that we will run - * the work function in the future; we do not promise to run it once - * per enqueue request - * - * we use the PENDING bit to merge together repeat requests without - * having to disable IRQs and take the spinlock, whilst still - * maintaining our promise - */ - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) - goto cancelled; - - /* we promise that we will not attempt to execute the work - * function in more than one thread simultaneously - * - * this, however, leaves us with a problem if we're asked to - * enqueue the work whilst someone is executing the work - * function as simply queueing the work immediately means that - * another thread may try executing it whilst it is already - * under execution - * - * to deal with this, we set the ENQ_DEFERRED bit instead of - * enqueueing, and the thread currently executing the work - * function will enqueue the work item when the work function - * returns and it has cleared the EXECUTING bit - */ - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - } else { - ret = slow_work_get_ref(work); - if (ret < 0) - goto failed; - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - wake_up(&slow_work_thread_wq); - - /* if someone who could be requeued is sleeping on a - * thread, then ask them to yield their thread */ - if (work->link.prev == queue) - wake_up(wfo_wq); - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - return 0; - -cancelled: - ret = -ECANCELED; -failed: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(slow_work_enqueue); - -static int slow_work_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * slow_work_cancel - Cancel a slow work item - * @work: The work item to cancel - * - * This function will cancel a previously enqueued work item. If we cannot - * cancel the work item, it is guarenteed to have run when this function - * returns. - */ -void slow_work_cancel(struct slow_work *work) -{ - bool wait = true, put = false; - - set_bit(SLOW_WORK_CANCELLING, &work->flags); - smp_mb(); - - /* if the work item is a delayed work item with an active timer, we - * need to wait for the timer to finish _before_ getting the spinlock, - * lest we deadlock against the timer routine - * - * the timer routine will leave DELAYED set if it notices the - * CANCELLING flag in time - */ - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - del_timer_sync(&dwork->timer); - } - - spin_lock_irq(&slow_work_queue_lock); - - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - /* the timer routine aborted or never happened, so we are left - * holding the timer's reference on the item and should just - * drop the pending flag and wait for any ongoing execution to - * finish */ - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - - BUG_ON(timer_pending(&dwork->timer)); - BUG_ON(!list_empty(&work->link)); - - clear_bit(SLOW_WORK_DELAYED, &work->flags); - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { - /* the link in the pending queue holds a reference on the item - * that we will need to release */ - list_del_init(&work->link); - wait = false; - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { - /* the executor is holding our only reference on the item, so - * we merely need to wait for it to finish executing */ - clear_bit(SLOW_WORK_PENDING, &work->flags); - } - - spin_unlock_irq(&slow_work_queue_lock); - - /* the EXECUTING flag is set by the executor whilst the spinlock is set - * and before the item is dequeued - so assuming the above doesn't - * actually dequeue it, simply waiting for the EXECUTING flag to be - * released here should be sufficient */ - if (wait) - wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, - TASK_UNINTERRUPTIBLE); - - clear_bit(SLOW_WORK_CANCELLING, &work->flags); - if (put) - slow_work_put_ref(work); -} -EXPORT_SYMBOL(slow_work_cancel); - -/* - * Handle expiry of the delay timer, indicating that a delayed slow work item - * should now be queued if not cancelled - */ -static void delayed_slow_work_timer(unsigned long data) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - struct slow_work *work = (struct slow_work *) data; - unsigned long flags; - bool queued = false, put = false, first = false; - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { - clear_bit(SLOW_WORK_DELAYED, &work->flags); - - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - /* we discard the reference the timer was holding in - * favour of the one the executor holds */ - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - put = true; - } else { - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - queued = true; - if (work->link.prev == queue) - first = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - if (put) - slow_work_put_ref(work); - if (first) - wake_up(wfo_wq); - if (queued) - wake_up(&slow_work_thread_wq); -} - -/** - * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing - * @dwork: The delayed work item to queue - * @delay: When to start executing the work, in jiffies from now - * - * This is similar to slow_work_enqueue(), but it adds a delay before the work - * is actually queued for processing. - * - * The item can have delayed processing requested on it whilst it is being - * executed. The delay will begin immediately, and if it expires before the - * item finishes executing, the item will be placed back on the queue when it - * has done executing. - */ -int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay) -{ - struct slow_work *work = &dwork->work; - unsigned long flags; - int ret; - - if (delay == 0) - return slow_work_enqueue(&dwork->work); - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - goto cancelled; - - /* the timer holds a reference whilst it is pending */ - ret = slow_work_get_ref(work); - if (ret < 0) - goto cant_get_ref; - - if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) - BUG(); - dwork->timer.expires = jiffies + delay; - dwork->timer.data = (unsigned long) work; - dwork->timer.function = delayed_slow_work_timer; - add_timer(&dwork->timer); - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - - return 0; - -cancelled: - ret = -ECANCELED; -cant_get_ref: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(delayed_slow_work_enqueue); - -/* - * Schedule a cull of the thread pool at some time in the near future - */ -static void slow_work_schedule_cull(void) -{ - mod_timer(&slow_work_cull_timer, - round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); -} - -/* - * Worker thread culling algorithm - */ -static bool slow_work_cull_thread(void) -{ - unsigned long flags; - bool do_cull = false; - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (slow_work_cull) { - slow_work_cull = false; - - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) { - slow_work_schedule_cull(); - do_cull = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return do_cull; -} - -/* - * Determine if there is slow work available for dispatch - */ -static inline bool slow_work_available(int vsmax) -{ - return !list_empty(&slow_work_queue) || - (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax); -} - -/* - * Worker thread dispatcher - */ -static int slow_work_thread(void *_data) -{ - int vsmax, id; - - DEFINE_WAIT(wait); - - set_freezable(); - set_user_nice(current, -5); - - /* allocate ourselves an ID */ - spin_lock_irq(&slow_work_queue_lock); - id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); - __set_bit(id, slow_work_ids); - slow_work_set_thread_pid(id, current->pid); - spin_unlock_irq(&slow_work_queue_lock); - - sprintf(current->comm, "kslowd%03u", id); - - for (;;) { - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); - if (!freezing(current) && - !slow_work_threads_should_exit && - !slow_work_available(vsmax) && - !slow_work_cull) - schedule(); - finish_wait(&slow_work_thread_wq, &wait); - - try_to_freeze(); - - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - if (slow_work_available(vsmax) && slow_work_execute(id)) { - cond_resched(); - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) - slow_work_schedule_cull(); - continue; - } - - if (slow_work_threads_should_exit) - break; - - if (slow_work_cull && slow_work_cull_thread()) - break; - } - - spin_lock_irq(&slow_work_queue_lock); - slow_work_set_thread_pid(id, 0); - __clear_bit(id, slow_work_ids); - spin_unlock_irq(&slow_work_queue_lock); - - if (atomic_dec_and_test(&slow_work_thread_count)) - complete_and_exit(&slow_work_last_thread_exited, 0); - return 0; -} - -/* - * Handle thread cull timer expiration - */ -static void slow_work_cull_timeout(unsigned long data) -{ - slow_work_cull = true; - wake_up(&slow_work_thread_wq); -} - -/* - * Start a new slow work thread - */ -static void slow_work_new_thread_execute(struct slow_work *work) -{ - struct task_struct *p; - - if (slow_work_threads_should_exit) - return; - - if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) - return; - - if (!mutex_trylock(&slow_work_user_lock)) - return; - - slow_work_may_not_start_new_thread = true; - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) { - printk(KERN_DEBUG "Slow work thread pool: OOM\n"); - if (atomic_dec_and_test(&slow_work_thread_count)) - BUG(); /* we're running on a slow work thread... */ - mod_timer(&slow_work_oom_timer, - round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); - } else { - /* ratelimit the starting of new threads */ - mod_timer(&slow_work_oom_timer, jiffies + 1); - } - - mutex_unlock(&slow_work_user_lock); -} - -static const struct slow_work_ops slow_work_new_thread_ops = { - .owner = THIS_MODULE, - .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = slow_work_new_thread_desc, -#endif -}; - -/* - * post-OOM new thread start suppression expiration - */ -static void slow_work_oom_timeout(unsigned long data) -{ - slow_work_may_not_start_new_thread = false; -} - -#ifdef CONFIG_SYSCTL -/* - * Handle adjustment of the minimum number of threads - */ -static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to start or stop threads */ - n = atomic_read(&slow_work_thread_count) - - slow_work_min_threads; - - if (n < 0 && !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - else if (n > 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} - -/* - * Handle adjustment of the maximum number of threads - */ -static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to stop threads */ - n = slow_work_max_threads - - atomic_read(&slow_work_thread_count); - - if (n < 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/** - * slow_work_register_user - Register a user of the facility - * @module: The module about to make use of the facility - * - * Register a user of the facility, starting up the initial threads if there - * aren't any other users at this point. This will return 0 if successful, or - * an error if not. - */ -int slow_work_register_user(struct module *module) -{ - struct task_struct *p; - int loop; - - mutex_lock(&slow_work_user_lock); - - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); - init_completion(&slow_work_last_thread_exited); - - slow_work_threads_should_exit = false; - slow_work_init(&slow_work_new_thread, - &slow_work_new_thread_ops); - slow_work_may_not_start_new_thread = false; - slow_work_cull = false; - - /* start the minimum number of threads */ - for (loop = 0; loop < slow_work_min_threads; loop++) { - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) - goto error; - } - printk(KERN_NOTICE "Slow work thread pool: Ready\n"); - } - - slow_work_user_count++; - mutex_unlock(&slow_work_user_lock); - return 0; - -error: - if (atomic_dec_and_test(&slow_work_thread_count)) - complete(&slow_work_last_thread_exited); - if (loop > 0) { - printk(KERN_ERR "Slow work thread pool:" - " Aborting startup on ENOMEM\n"); - slow_work_threads_should_exit = true; - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_ERR "Slow work thread pool: Aborted\n"); - } - mutex_unlock(&slow_work_user_lock); - return PTR_ERR(p); -} -EXPORT_SYMBOL(slow_work_register_user); - -/* - * wait for all outstanding items from the calling module to complete - * - note that more items may be queued whilst we're waiting - */ -static void slow_work_wait_for_items(struct module *module) -{ -#ifdef CONFIG_MODULES - DECLARE_WAITQUEUE(myself, current); - struct slow_work *work; - int loop; - - mutex_lock(&slow_work_unreg_sync_lock); - add_wait_queue(&slow_work_unreg_wq, &myself); - - for (;;) { - spin_lock_irq(&slow_work_queue_lock); - - /* first of all, we wait for the last queued item in each list - * to be processed */ - list_for_each_entry_reverse(work, &vslow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - list_for_each_entry_reverse(work, &slow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - - /* then we wait for the items being processed to finish */ - slow_work_unreg_module = module; - smp_mb(); - for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { - if (slow_work_thread_processing[loop] == module) - goto do_wait; - } - spin_unlock_irq(&slow_work_queue_lock); - break; /* okay, we're done */ - - do_wait: - spin_unlock_irq(&slow_work_queue_lock); - schedule(); - slow_work_unreg_work_item = NULL; - slow_work_unreg_module = NULL; - } - - remove_wait_queue(&slow_work_unreg_wq, &myself); - mutex_unlock(&slow_work_unreg_sync_lock); -#endif /* CONFIG_MODULES */ -} - -/** - * slow_work_unregister_user - Unregister a user of the facility - * @module: The module whose items should be cleared - * - * Unregister a user of the facility, killing all the threads if this was the - * last one. - * - * This waits for all the work items belonging to the nominated module to go - * away before proceeding. - */ -void slow_work_unregister_user(struct module *module) -{ - /* first of all, wait for all outstanding items from the calling module - * to complete */ - if (module) - slow_work_wait_for_items(module); - - /* then we can actually go about shutting down the facility if need - * be */ - mutex_lock(&slow_work_user_lock); - - BUG_ON(slow_work_user_count <= 0); - - slow_work_user_count--; - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); - slow_work_threads_should_exit = true; - del_timer_sync(&slow_work_cull_timer); - del_timer_sync(&slow_work_oom_timer); - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_NOTICE "Slow work thread pool:" - " Shut down complete\n"); - } - - mutex_unlock(&slow_work_user_lock); -} -EXPORT_SYMBOL(slow_work_unregister_user); - -/* - * Initialise the slow work facility - */ -static int __init init_slow_work(void) -{ - unsigned nr_cpus = num_possible_cpus(); - - if (slow_work_max_threads < nr_cpus) - slow_work_max_threads = nr_cpus; -#ifdef CONFIG_SYSCTL - if (slow_work_max_max_threads < nr_cpus * 2) - slow_work_max_max_threads = nr_cpus * 2; -#endif -#ifdef CONFIG_SLOW_WORK_DEBUG - { - struct dentry *dbdir; - - dbdir = debugfs_create_dir("slow_work", NULL); - if (dbdir && !IS_ERR(dbdir)) - debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, - NULL, &slow_work_runqueue_fops); - } -#endif - return 0; -} - -subsys_initcall(init_slow_work); diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41d..000000000000 --- a/kernel/slow-work.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Slow work private definitions - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ - -/* - * slow-work.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern struct slow_work *slow_work_execs[]; -extern pid_t slow_work_pids[]; -extern rwlock_t slow_work_execs_lock; -#endif - -extern struct list_head slow_work_queue; -extern struct list_head vslow_work_queue; -extern spinlock_t slow_work_queue_lock; - -/* - * slow-work-debugfs.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern const struct file_operations slow_work_runqueue_fops; - -extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -#endif - -/* - * Helper functions - */ -static inline void slow_work_set_thread_pid(int id, pid_t pid) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_pids[id] = pid; -#endif -} - -static inline void slow_work_mark_time(struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - work->mark = CURRENT_TIME; -#endif -} - -static inline void slow_work_begin_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_execs[id] = work; -#endif -} - -static inline void slow_work_end_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - write_lock(&slow_work_execs_lock); - slow_work_execs[id] = NULL; - write_unlock(&slow_work_execs_lock); -#endif -} diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 70f8d90331e9..4372ccb25127 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,9 +35,9 @@ struct cpu_stop_done { /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { spinlock_t lock; + bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ struct task_struct *thread; /* stopper thread */ - bool enabled; /* is this stopper enabled? */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); diff --git a/kernel/sys.c b/kernel/sys.c index e83ddbbaf89d..e9ad44489828 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1236,15 +1236,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } + struct rlimit value; + int ret; + + ret = do_prlimit(current, resource, NULL, &value); + if (!ret) + ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + + return ret; } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1272,44 +1271,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +static inline bool rlim64_is_infinity(__u64 rlim64) { - struct rlimit new_rlim, *old_rlim; - int retval; +#if BITS_PER_LONG < 64 + return rlim64 >= ULONG_MAX; +#else + return rlim64 == RLIM64_INFINITY; +#endif +} + +static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) +{ + if (rlim->rlim_cur == RLIM_INFINITY) + rlim64->rlim_cur = RLIM64_INFINITY; + else + rlim64->rlim_cur = rlim->rlim_cur; + if (rlim->rlim_max == RLIM_INFINITY) + rlim64->rlim_max = RLIM64_INFINITY; + else + rlim64->rlim_max = rlim->rlim_max; +} + +static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) +{ + if (rlim64_is_infinity(rlim64->rlim_cur)) + rlim->rlim_cur = RLIM_INFINITY; + else + rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; + if (rlim64_is_infinity(rlim64->rlim_max)) + rlim->rlim_max = RLIM_INFINITY; + else + rlim->rlim_max = (unsigned long)rlim64->rlim_max; +} + +/* make sure you are allowed to change @tsk limits before calling this */ +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - - retval = security_task_setrlimit(resource, &new_rlim); - if (retval) - return retval; - - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim.rlim_cur = 1; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; } - task_lock(current->group_leader); - *old_rlim = new_rlim; - task_unlock(current->group_leader); - - if (resource != RLIMIT_CPU) + /* protect tsk->signal and tsk->sighand from disappearing */ + read_lock(&tasklist_lock); + if (!tsk->sighand) { + retval = -ESRCH; goto out; + } + + rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim) { + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, + resource, new_rlim); + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim->rlim_cur = 1; + } + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Note that the kernel fails to return an error @@ -1317,14 +1361,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim.rlim_cur == RLIM_INFINITY) - goto out; - - update_rlimit_cpu(new_rlim.rlim_cur); + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY) + update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: + read_unlock(&tasklist_lock); + return retval; +} + +/* rcu lock must be held */ +static int check_prlimit_permission(struct task_struct *task) +{ + const struct cred *cred = current_cred(), *tcred; + + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_RESOURCE)) { + return -EPERM; + } + return 0; } +SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, + const struct rlimit64 __user *, new_rlim, + struct rlimit64 __user *, old_rlim) +{ + struct rlimit64 old64, new64; + struct rlimit old, new; + struct task_struct *tsk; + int ret; + + if (new_rlim) { + if (copy_from_user(&new64, new_rlim, sizeof(new64))) + return -EFAULT; + rlim64_to_rlim(&new64, &new); + } + + rcu_read_lock(); + tsk = pid ? find_task_by_vpid(pid) : current; + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + ret = check_prlimit_permission(tsk); + if (ret) { + rcu_read_unlock(); + return ret; + } + get_task_struct(tsk); + rcu_read_unlock(); + + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + + if (!ret && old_rlim) { + rlim_to_rlim64(&old, &old64); + if (copy_to_user(old_rlim, &old64, sizeof(old64))) + ret = -EFAULT; + } + + put_task_struct(tsk); + return ret; +} + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_prlimit(current, resource, &new_rlim, NULL); +} + /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 70f2ea758ffe..bad369ec5403 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); + +/* fanotify! */ +cond_syscall(sys_fanotify_init); +cond_syscall(sys_fanotify_mark); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f79c7f81c96..ca38e8e3e907 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,16 +44,17 @@ #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> +#include <linux/dnotify.h> #include <linux/syscalls.h> #include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/slow-work.h> #include <linux/perf_event.h> #include <linux/kprobes.h> #include <linux/pipe_fs_i.h> +#include <linux/oom.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -86,9 +87,6 @@ /* External variables not in a header file. */ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern int sysctl_panic_on_oom; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -134,6 +132,9 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; +#ifdef CONFIG_INOTIFY_USER +#include <linux/inotify.h> +#endif #ifdef CONFIG_SPARC #include <asm/system.h> #endif @@ -210,9 +211,6 @@ static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; extern struct ctl_table random_table[]; -#ifdef CONFIG_INOTIFY_USER -extern struct ctl_table inotify_table[]; -#endif #ifdef CONFIG_EPOLL extern struct ctl_table epoll_table[]; #endif @@ -566,7 +564,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#ifdef CONFIG_HOTPLUG { .procname = "hotplug", .data = &uevent_helper, @@ -917,13 +915,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SLOW_WORK - { - .procname = "slow-work", - .mode = 0555, - .child = slow_work_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS { .procname = "perf_event_paranoid", diff --git a/kernel/time.c b/kernel/time.c index 848b1c2ab09a..ba9b338d1835 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0a..f06a8a365648 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -6,7 +6,7 @@ config TICK_ONESHOT config NO_HZ bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables a tickless system: timer interrupts will @@ -15,7 +15,7 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c1d561..c18d7efa1b4b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) return max_nsecs - (max_nsecs >> 5); } -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET /** * clocksource_select - Select the best clocksource available @@ -577,7 +577,7 @@ static void clocksource_select(void) } } -#else /* CONFIG_GENERIC_TIME */ +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) #define MAX_UPDATE_LENGTH 5 /* Seconds */ /** - * __clocksource_register_scale - Used to install new clocksources + * __clocksource_updatefreq_scale - Used update clocksource with new freq * @t: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * - * Returns -EBUSY if registration fails, zero otherwise. + * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_register_hz() or clocksource_register_khz helper functions. + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. */ -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* * Ideally we want to use some of the limits used in * clocksource_max_deferment, to provide a more informed @@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC/scale, MAX_UPDATE_LENGTH*scale); cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Intialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5fc66d..48b2761b5668 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb61..3e216e01bbd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { + arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); - u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c8..49010d822f72 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; /* @@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } -#ifdef CONFIG_GENERIC_TIME - /** * timekeeping_forward_now - update clock to the current time * @@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) tick_clock_notify(); } -#else /* GENERIC_TIME */ - -static inline void timekeeping_forward_now(void) { } - -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -#endif /* !GENERIC_TIME */ - /** * ktime_get_real - get the real (wall-) time in ktime_t format * @@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); + xtime = timespec_add(xtime, ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + total_sleep_time = timespec_add(total_sleep_time, ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset) static cycle_t logarithmic_accumulation(cycle_t offset, int shift) { u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ if (offset < timekeeper.cycle_interval<<shift) @@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) second_overflow(); } - /* Accumulate into raw time */ - raw_time.tv_nsec += timekeeper.raw_interval << shift;; - while (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; + /* Accumulate raw time */ + raw_nsecs = timekeeper.raw_interval << shift; + raw_nsecs += raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; + raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); + raw_time.tv_sec += raw_secs; } + raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; @@ -784,10 +742,11 @@ void update_wall_time(void) return; clock = timekeeper.clock; -#ifdef CONFIG_GENERIC_TIME - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; -#else + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = timekeeper.cycle_interval; +#else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; @@ -856,7 +815,8 @@ void update_wall_time(void) } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** @@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + *ts = timespec_add(*ts, total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); @@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void) return xtime; } +struct timespec __get_wall_to_monotonic(void) +{ + return wall_to_monotonic; +} + struct timespec current_kernel_time(void) { struct timespec now; diff --git a/kernel/timer.c b/kernel/timer.c index c29e2d4d2a66..97bf05baade7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. */ #define TBASE_DEFERRABLE_FLAG (0x1) @@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); /** * set_timer_slack - set the allowed slack for a timer + * @timer: the timer to be modified * @slack_hz: the amount of time (in jiffies) allowed for rounding * * Set the amount of time, in jiffies, that a certain timer has @@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); @@ -1749,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c7683fd8a03a..538501c6ea50 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -153,7 +153,7 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -175,7 +175,7 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE @@ -323,17 +323,6 @@ config STACK_TRACER Say N if unsure. -config WORKQUEUE_TRACER - bool "Trace workqueues" - select GENERIC_TRACER - help - The workqueue tracer provides some statistical information - about each cpu workqueue thread such as the number of the - works inserted and executed since their creation. It can help - to evaluate the amount of work each of them has to perform. - For example it can help a developer to decide whether he should - choose a per-cpu workqueue instead of a singlethreaded one. - config BLK_DEV_IO_TRACE bool "Support for tracing block IO actions" depends on SYSFS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 438e84a56ab3..53f338190b26 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,5 +53,8 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 638711c17504..959f8d6c8cc1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +#define BLK_TC_HARDBARRIER BLK_TC_BARRIER +#define BLK_TC_RAHEAD BLK_TC_AHEAD + /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a @@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNCIO); - what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, HARDBARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); @@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + memcpy(&buts.name, &cbuts.name, 32); + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, 32)) { + blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + int blk_trace_startstop(struct request_queue *q, int start) { int ret; @@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; + lock_kernel(); mutex_lock(&bdev->bd_mutex); switch (cmd) { @@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) bdevname(bdev, b); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) + case BLKTRACESETUP32: + bdevname(bdev, b); + ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#endif case BLKTRACESTART: start = 1; case BLKTRACESTOP: @@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } mutex_unlock(&bdev->bd_mutex); + unlock_kernel(); return ret; } @@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; - if (blk_pc_request(rq)) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, what, rq->errors, rq->cmd_len, rq->cmd); @@ -925,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (blk_pc_request(rq)) + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, rq->errors, len, data); else @@ -1730,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq) int len = rq->cmd_len; unsigned char *cmd = rq->cmd; - if (!blk_pc_request(rq)) { + if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { buf[0] = '\0'; return; } @@ -1755,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & WRITE) rwbs[i++] = 'W'; - else if (rw & 1 << BIO_RW_DISCARD) + else if (rw & REQ_DISCARD) rwbs[i++] = 'D'; else if (bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (rw & 1 << BIO_RW_AHEAD) + if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & 1 << BIO_RW_BARRIER) + if (rw & REQ_HARDBARRIER) rwbs[i++] = 'B'; - if (rw & 1 << BIO_RW_SYNCIO) + if (rw & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & 1 << BIO_RW_META) + if (rw & REQ_META) rwbs[i++] = 'M'; + if (rw & REQ_SECURE) + rwbs[i++] = 'E'; rwbs[i] = '\0'; } @@ -1779,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) int rw = rq->cmd_flags & 0x03; int bytes; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; bytes = blk_rq_bytes(rq); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3632ce87674f..19cccc3c3028 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3846,6 +3846,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rpos = reader->read; pos += size; + if (rpos >= commit) + break; + event = rb_reader_event(cpu_buffer); size = rb_event_length(event); } while (len > size); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4b1122d0df37..9ec59f541156 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_var_t __read_mostly tracing_buffer_mask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu(cpu, tracing_buffer_mask) +cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -744,13 +741,6 @@ __acquires(kernel_lock) return -1; } - /* - * When this gets called we hold the BKL which means that - * preemption is disabled. Various trace selftests however - * need to disable and enable preemption for successful tests. - * So we drop the BKL here and grab it after the tests again. - */ - unlock_kernel(); mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -832,7 +822,6 @@ __acquires(kernel_lock) #endif out_unlock: - lock_kernel(); return ret; } @@ -1493,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; - static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ @@ -1595,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, } /* Find the next real entry, and increment the iterator to the next entry */ -static void *find_next_entry_inc(struct trace_iterator *iter) +void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); @@ -1630,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return NULL; if (iter->idx < 0) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } -static void tracing_iter_reset(struct trace_iterator *iter, int cpu) +void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct trace_array *tr = iter->tr; struct ring_buffer_event *event; @@ -2003,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter) } /* Called with trace_event_read_lock() held. */ -static enum print_line_t print_trace_line(struct trace_iterator *iter) +enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -3193,7 +3177,7 @@ waitagain: trace_event_read_lock(); trace_access_lock(iter->cpu_file); - while (find_next_entry_inc(iter) != NULL) { + while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3276,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; - if (!find_next_entry_inc(iter)) { + if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; @@ -3332,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (ret <= 0) goto out_err; - if (!iter->ent && !find_next_entry_inc(iter)) { + if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } @@ -3479,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; + size_t written; if (tracing_disabled) return -EINVAL; @@ -3500,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else buf[cnt] = '\0'; - cnt = mark_printk("%s", buf); + written = mark_printk("%s", buf); kfree(buf); - *fpos += cnt; + *fpos += written; - return cnt; + /* don't tell userspace we wrote more - it might confuse them */ + if (written > cnt) + written = cnt; + + return written; } static int tracing_clock_show(struct seq_file *m, void *v) @@ -4402,7 +4391,7 @@ static struct notifier_block trace_die_notifier = { */ #define KERN_TRACE KERN_EMERG -static void +void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ @@ -4417,6 +4406,13 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + static void __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { @@ -4442,8 +4438,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + trace_init_global_iter(&iter); + for_each_tracing_cpu(cpu) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&iter.tr->data[cpu]->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4492,7 +4490,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (find_next_entry_inc(&iter) != NULL) { + if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); @@ -4514,7 +4512,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&iter.tr->data[cpu]->disabled); } tracing_on(); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d05c873dd4b2..d39b3c5454a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -314,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); @@ -351,6 +359,15 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -436,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 52fda6c04ac3..685a67d55db0 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -55,7 +55,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 09b4fa6e4d3b..398c0e8b332c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -598,88 +598,146 @@ out: return ret; } -static void print_event_fields(struct trace_seq *s, struct list_head *head) +enum { + FORMAT_HEADER = 1, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) { + struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); - list_for_each_entry_reverse(field, head, link) { - /* - * Smartly shows the array type(except dynamic array). - * Normal: - * field:TYPE VAR - * If TYPE := TYPE[LEN], it is shown: - * field:TYPE VAR[LEN] - */ - const char *array_descriptor = strchr(field->type, '['); + (*pos)++; - if (!strncmp(field->type, "__data_loc", 10)) - array_descriptor = NULL; + switch ((unsigned long)v) { + case FORMAT_HEADER: + if (unlikely(list_empty(common_head))) + return NULL; - if (!array_descriptor) { - trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" - "\tsize:%u;\tsigned:%d;\n", - field->type, field->name, field->offset, - field->size, !!field->is_signed); - } else { - trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" - "\tsize:%u;\tsigned:%d;\n", - (int)(array_descriptor - field->type), - field->type, field->name, - array_descriptor, field->offset, - field->size, !!field->is_signed); - } + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + + case FORMAT_FIELD_SEPERATOR: + if (unlikely(list_empty(head))) + return NULL; + + field = list_entry(head->prev, struct ftrace_event_field, link); + return field; + + case FORMAT_PRINTFMT: + /* all done */ + return NULL; } + + field = v; + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) + return (void *)FORMAT_PRINTFMT; + + field = list_entry(field->link.prev, struct ftrace_event_field, link); + + return field; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +static void *f_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call = filp->private_data; - struct list_head *head; - struct trace_seq *s; - char *buf; - int r; + loff_t l = 0; + void *p; - if (*ppos) + /* Start by showing the header */ + if (!*pos) + return (void *)FORMAT_HEADER; + + p = (void *)FORMAT_HEADER; + do { + p = f_next(m, p, &l); + } while (p && l < *pos); + + return p; +} + +static int f_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + const char *array_descriptor; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "ID: %d\n", call->event.type); + seq_printf(m, "format:\n"); return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; - trace_seq_init(s); + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; + } - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->event.type); - trace_seq_printf(s, "format:\n"); + field = v; - /* print common fields */ - print_event_fields(s, &ftrace_common_fields); + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); - trace_seq_putc(s, '\n'); + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; - /* print event specific fields */ - head = trace_get_fields(call); - print_event_fields(s, head); + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); - r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt); + return 0; +} - if (!r) { - /* - * ug! The format output is bigger than a PAGE!! - */ - buf = "FORMAT TOO BIG\n"; - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, strlen(buf)); - goto out; - } +static void f_stop(struct seq_file *m, void *p) +{ +} - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - out: - kfree(s); - return r; +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct ftrace_event_call *call = inode->i_private; + struct seq_file *m; + int ret; + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = call; + + return 0; } static ssize_t @@ -877,8 +935,10 @@ static const struct file_operations ftrace_enable_fops = { }; static const struct file_operations ftrace_event_format_fops = { - .open = tracing_open_generic, - .read = event_format_read, + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index fcb5a542cd21..c93bcb248638 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter, * if the output fails. */ data->ent = *curr; - data->ret = *next; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. + */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; } } diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 000000000000..7b8ecd751d93 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,136 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "../debug/kdb/kdb_private.h" +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b2d70d38dff4..25915832291a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..2994a0e3a61c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,41 +33,287 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> -#define CREATE_TRACE_POINTS -#include <trace/events/workqueue.h> +#include <linux/idr.h> + +#include "workqueue_sched.h" + +enum { + /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + + /* worker flags */ + WORKER_STARTED = 1 << 0, /* started */ + WORKER_DIE = 1 << 1, /* die die die */ + WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ + + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ + BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, + BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only on emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, +}; /* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * Structure fields follow one of the following exclusion rules. + * + * I: Set during initialization and read-only afterwards. + * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * + * L: gcwq->lock protected. Access with gcwq->lock held. + * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * If GCWQ_DISASSOCIATED is set, it's identical to L. + * + * F: wq->flush_mutex protected. + * + * W: workqueue_lock protected. */ -struct cpu_workqueue_struct { - spinlock_t lock; +struct global_cwq; - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; +/* + * The poor guys doing the actual heavy lifting. All on-duty workers + * are either serving the manager role, on idle list or on busy hash. + */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; - struct workqueue_struct *wq; - struct task_struct *thread; -} ____cacheline_aligned; + struct work_struct *current_work; /* L: work being processed */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct global_cwq *gcwq; /* I: the associated gcwq */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ +}; + +/* + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. + */ +struct global_cwq { + spinlock_t lock; /* the gcwq lock */ + struct list_head worklist; /* L: list of pending works */ + unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ + + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + /* workers are chained either in the idle_list or busy_hash */ + struct list_head idle_list; /* X: list of idle workers */ + struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* L: hash of busy workers */ + + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + + struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ +} ____cacheline_aligned_in_smp; + +/* + * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. + */ +struct cpu_workqueue_struct { + struct global_cwq *gcwq; /* I: the associated gcwq */ + struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* L: max active works */ + struct list_head delayed_works; /* L: delayed works */ +}; + +/* + * Structure used to wait for workqueue flush. + */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + +/* + * All cpumasks are assumed to be always set on UP and thus can't be + * used to determine whether there's something to be done. + */ +#ifdef CONFIG_SMP +typedef cpumask_var_t mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) \ + cpumask_test_and_set_cpu((cpu), (mask)) +#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) +#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) +#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) +#define free_mayday_mask(mask) free_cpumask_var((mask)) +#else +typedef unsigned long mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) +#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) +#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) +#define alloc_mayday_mask(maskp, gfp) true +#define free_mayday_mask(mask) do { } while (0) +#endif /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; + unsigned int flags; /* I: WQ_* flags */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct *single; + unsigned long v; + } cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + + mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + + int saved_max_active; /* W: saved cwq max_active */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); + +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +/* + * CPU iterators + * + * An extra gcwq is defined for an invalid cpu number + * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any + * specific CPU. The following iterators are similar to + * for_each_*_cpu() iterators but also considers the unbound gcwq. + * + * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * WORK_CPU_UNBOUND for unbound workqueues + */ +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; @@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { } /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ + +/* + * The almighty global cpu workqueues. nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. + */ +static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); -static int singlethread_cpu __read_mostly; -static const struct cpumask *cpu_singlethread_map __read_mostly; /* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. */ -static cpumask_var_t cpu_populated_map __read_mostly; +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ -/* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static int worker_thread(void *__worker); + +static struct global_cwq *get_gcwq(unsigned int cpu) { - return wq->singlethread; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; } -static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +static atomic_t *get_gcwq_nr_running(unsigned int cpu) { - return is_wq_single_threaded(wq) - ? cpu_singlethread_map : cpu_populated_map; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) { - if (unlikely(is_wq_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#else + return wq->cpu_wq.single; +#endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; +} + +static unsigned int work_color_to_flags(int color) +{ + return color << WORK_STRUCT_COLOR_SHIFT; +} + +static int get_work_color(struct work_struct *work) +{ + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; } /* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. + * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) { - unsigned long new; - BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} + +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +} + +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + unsigned int cpu; + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == WORK_CPU_NONE) + return NULL; + + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); + return get_gcwq(cpu); +} + +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; +} + +/* + * Need to wake up a worker? Called from anything but currently + * running workers. + */ +static bool need_more_worker(struct global_cwq *gcwq) +{ + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); +} + +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) +{ + return gcwq->nr_idle; +} + +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? */ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /* - * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + * Wake up functions. */ -static inline void clear_wq_data(struct work_struct *work) + +/* Return the first worker. Safe with preemption disabled */ +static struct worker *first_worker(struct global_cwq *gcwq) { - unsigned long flags = *work_data_bits(work) & - (1UL << WORK_STRUCT_STATIC); - atomic_long_set(&work->data, flags); + if (unlikely(list_empty(&gcwq->idle_list))) + return NULL; + + return list_first_entry(&gcwq->idle_list, struct worker, entry); } -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +/** + * wake_up_worker - wake up an idle worker + * @gcwq: gcwq to wake worker for + * + * Wake up the first idle worker of @gcwq. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void wake_up_worker(struct global_cwq *gcwq) { - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + struct worker *worker = first_worker(gcwq); + + if (likely(worker)) + wake_up_process(worker->task); +} + +/** + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. + */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * @wakeup: wakeup an idle worker if necessary + * + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags, + bool wakeup) +{ + struct global_cwq *gcwq = worker->gcwq; + + WARN_ON_ONCE(worker->task != current); + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. + */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + + worker->flags |= flags; } +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* if transitioning out of NOT_RUNNING, increment nr_running */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); +} + +/** + * busy_worker_head - return the busy hash head for a work + * @gcwq: gcwq of interest + * @work: work to be hashed + * + * Return hash head of @gcwq for @work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to the hash head. + */ +static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, + struct work_struct *work) +{ + const int base_shift = ilog2(sizeof(struct work_struct)); + unsigned long v = (unsigned long)work; + + /* simple shift and fold hash, do we need something better? */ + v >>= base_shift; + v += v >> BUSY_WORKER_HASH_ORDER; + v &= BUSY_WORKER_HASH_MASK; + + return &gcwq->busy_hash[v]; +} + +/** + * __find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @bwh: hash head as returned by busy_worker_head() + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. @bwh should be + * the hash head obtained by calling busy_worker_head() with the same + * work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, + struct hlist_head *bwh, + struct work_struct *work) +{ + struct worker *worker; + struct hlist_node *tmp; + + hlist_for_each_entry(worker, tmp, bwh, hentry) + if (worker->current_work == work) + return worker; + return NULL; +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. This function is + * identical to __find_worker_executing_work() except that this + * function calculates @bwh itself. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct global_cwq *gcwq, + struct work_struct *work) +{ + return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), + work); +} + +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to inserstion position. + */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + +/** + * insert_work - insert a work into gcwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work which belongs to @cwq into @gcwq after @head. + * @extra_flags is or'd to work_struct flags. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { - trace_workqueue_insertion(cwq->thread, work); + struct global_cwq *gcwq = cwq->gcwq; + + /* we own @work, set data and link */ + set_work_cwq(work, cwq, extra_flags); - set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); + list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); + + /* + * Ensure either worker_sched_deactivated() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. + */ + smp_mb(); + + if (__need_more_worker(gcwq)) + wake_up_worker(gcwq); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; unsigned long flags; debug_work_activate(work); - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); + + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *last_gcwq; + + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. + */ + gcwq = get_gcwq(cpu); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + + if (likely(cwq->nr_active < cwq->max_active)) { + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else + worklist = &cwq->delayed_works; + + insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + unsigned int lcpu; + BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -static void run_workqueue(struct cpu_workqueue_struct *cwq) +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_enter_idle(struct worker *worker) { - spin_lock_irq(&cwq->lock); - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(worker->flags & WORKER_IDLE); + BUG_ON(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev)); + + /* can't use worker_set_flags(), also called from start_worker() */ + worker->flags |= WORKER_IDLE; + gcwq->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &gcwq->idle_list); + + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); + + /* sanity check nr_running */ + WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + atomic_read(get_gcwq_nr_running(gcwq->cpu))); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(!(worker->flags & WORKER_IDLE)); + worker_clr_flags(worker, WORKER_IDLE); + gcwq->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - trace_workqueue_execution(cwq->thread, work); - debug_work_deactivate(work); - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(¤t->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* CPU has come up inbetween, retry migration */ + cpu_relax(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + +static struct worker *alloc_worker(void) +{ + struct worker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) { + INIT_LIST_HEAD(&worker->entry); + INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; + } + return worker; +} + +/** + * create_worker - create a new workqueue worker + * @gcwq: gcwq the new worker will belong to + * @bind: whether to set affinity to @cpu or not + * + * Create a new worker which is bound to @gcwq. The returned worker + * can be started by calling start_worker() or destroyed using + * destroy_worker(). + * + * CONTEXT: + * Might sleep. Does GFP_KERNEL allocations. + * + * RETURNS: + * Pointer to the newly created worker. + */ +static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +{ + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker *worker = NULL; + int id = -1; + + spin_lock_irq(&gcwq->lock); + while (ida_get_new(&gcwq->worker_ida, &id)) { + spin_unlock_irq(&gcwq->lock); + if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + goto fail; + spin_lock_irq(&gcwq->lock); + } + spin_unlock_irq(&gcwq->lock); + + worker = alloc_worker(); + if (!worker) + goto fail; + + worker->gcwq = gcwq; + worker->id = id; + + if (!on_unbound_cpu) + worker->task = kthread_create(worker_thread, worker, + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); + if (IS_ERR(worker->task)) + goto fail; + + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. + */ + if (bind && !on_unbound_cpu) + kthread_bind(worker->task, gcwq->cpu); + else { + worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } + + return worker; +fail: + if (id >= 0) { + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); + spin_unlock_irq(&gcwq->lock); + } + kfree(worker); + return NULL; +} + +/** + * start_worker - start a newly created worker + * @worker: worker to start + * + * Make the gcwq aware of @worker and start it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void start_worker(struct worker *worker) +{ + worker->flags |= WORKER_STARTED; + worker->gcwq->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); +} + +/** + * destroy_worker - destroy a workqueue worker + * @worker: worker to be destroyed + * + * Destroy @worker and adjust @gcwq stats accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void destroy_worker(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + int id = worker->id; + + /* sanity check frenzy */ + BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); + + if (worker->flags & WORKER_STARTED) + gcwq->nr_workers--; + if (worker->flags & WORKER_IDLE) + gcwq->nr_idle--; + + list_del_init(&worker->entry); + worker->flags |= WORKER_DIE; + + spin_unlock_irq(&gcwq->lock); + + kthread_stop(worker->task); + kfree(worker); + + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); +} + +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); + } + } + + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); + } + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); +} + +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_create_worker(struct global_cwq *gcwq) +{ + if (!need_to_create_worker(gcwq)) + return false; +restart: + spin_unlock_irq(&gcwq->lock); + + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; } - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; + if (!need_to_create_worker(gcwq)) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); + + if (!need_to_create_worker(gcwq)) + break; } - spin_unlock_irq(&cwq->lock); + + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; } -static int worker_thread(void *__cwq) +/** + * maybe_destroy_worker - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) { - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); + bool ret = false; - if (cwq->wq->freezeable) - set_freezable(); + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; - try_to_freeze(); + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); + break; + } - if (kthread_should_stop()) + destroy_worker(worker); + ret = true; + } + + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to. At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. + */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + + move_linked_works(work, pos, NULL); + cwq->nr_active++; +} - run_workqueue(cwq); +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + cwq->nr_active--; + + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); } - return 0; + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * process_one_work - process single work + * @worker: self + * @work: work to process + * + * Process @work. This function contains all the logics necessary to + * process a single work including synchronization against and + * interaction with other workers on the same cpu, queueing and + * flushing. As long as context requirement is met, any worker can + * call this function to process a work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void process_one_work(struct worker *worker, struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct global_cwq *gcwq = cwq->gcwq; + struct hlist_head *bwh = busy_worker_head(gcwq, work); + bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; + work_func_t f = work->func; + int work_color; + struct worker *collision; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct from + * inside the function that is called from it, this we need to + * take into account for lockdep too. To avoid bogus "held + * lock freed" warnings as well as problems when looking into + * work->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + /* + * A single work shouldn't be executed concurrently by + * multiple workers on a single cpu. Check whether anyone is + * already processing the work. If so, defer the work to the + * currently executing one. + */ + collision = __find_worker_executing_work(gcwq, bwh, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, NULL); + return; + } + + /* claim and process */ + debug_work_deactivate(work); + hlist_add_head(&worker->hentry, bwh); + worker->current_work = work; + worker->current_cwq = cwq; + work_color = get_work_color(work); + + /* record the current cpu number in the work data and dequeue */ + set_work_cpu(work, gcwq->cpu); + list_del_init(&work->entry); + + /* + * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, + * wake up another worker; otherwise, clear HIGHPRI_PENDING. + */ + if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + struct work_struct *nwork = list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (!list_empty(&gcwq->worklist) && + get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) + wake_up_worker(gcwq); + else + gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + } + + /* + * CPU intensive works don't participate in concurrency + * management. They're the scheduler's responsibility. + */ + if (unlikely(cpu_intensive)) + worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + + spin_unlock_irq(&gcwq->lock); + + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + f(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), task_pid_nr(current)); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + + spin_lock_irq(&gcwq->lock); + + /* clear cpu intensive status */ + if (unlikely(cpu_intensive)) + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + + /* we're done with it, release */ + hlist_del_init(&worker->hentry); + worker->current_work = NULL; + worker->current_cwq = NULL; + cwq_dec_nr_in_flight(cwq, work_color); +} + +/** + * process_scheduled_works - process scheduled works + * @worker: self + * + * Process all scheduled works. Please note that the scheduled list + * may change while processing a work, so this function repeatedly + * fetches a work from the top and executes it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. + */ +static void process_scheduled_works(struct worker *worker) +{ + while (!list_empty(&worker->scheduled)) { + struct work_struct *work = list_first_entry(&worker->scheduled, + struct work_struct, entry); + process_one_work(worker, work); + } +} + +/** + * worker_thread - the worker thread function + * @__worker: self + * + * The gcwq worker thread function. There's a single dynamic pool of + * these per each cpu. These workers process all works regardless of + * their specific target workqueue. The only exception is works which + * belong to workqueues with a rescuer which will be explained in + * rescuer_thread(). + */ +static int worker_thread(void *__worker) +{ + struct worker *worker = __worker; + struct global_cwq *gcwq = worker->gcwq; + + /* tell the scheduler that this is a workqueue worker */ + worker->task->flags |= PF_WQ_WORKER; +woke_up: + spin_lock_irq(&gcwq->lock); + + /* DIE can be set only while we're idle, checking here is enough */ + if (worker->flags & WORKER_DIE) { + spin_unlock_irq(&gcwq->lock); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + worker_leave_idle(worker); +recheck: + /* no more worker necessary? */ + if (!need_more_worker(gcwq)) + goto sleep; + + /* do we need to manage? */ + if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * ->scheduled list can only be filled while a worker is + * preparing to process a work or actually processing it. + * Make sure nobody diddled with it while I was sleeping. + */ + BUG_ON(!list_empty(&worker->scheduled)); + + /* + * When control reaches this point, we're guaranteed to have + * at least one idle worker or that someone else has already + * assumed the manager role. + */ + worker_clr_flags(worker, WORKER_PREP); + + do { + struct work_struct *work = + list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { + /* optimization path, not strictly necessary */ + process_one_work(worker, work); + if (unlikely(!list_empty(&worker->scheduled))) + process_scheduled_works(worker); + } else { + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); + } + } while (keep_working(gcwq)); + + worker_set_flags(worker, WORKER_PREP, false); +sleep: + if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * gcwq->lock is held and there's no work to process and no + * need to manage, sleep. Workers are woken up only while + * holding gcwq->lock or from local cpu, so setting the + * current state before releasing gcwq->lock is enough to + * prevent losing any event. + */ + worker_enter_idle(worker); + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&gcwq->lock); + schedule(); + goto woke_up; +} + +/** + * rescuer_thread - the rescuer thread function + * @__wq: the associated workqueue + * + * Workqueue rescuer thread function. There's one rescuer for each + * workqueue which has WQ_RESCUER set. + * + * Regular work processing on a gcwq may block trying to create a new + * worker which uses GFP_KERNEL allocation which has slight chance of + * developing into deadlock if some works currently on the same queue + * need to be processed to satisfy the GFP_KERNEL allocation. This is + * the problem rescuer solves. + * + * When such condition is possible, the gcwq summons rescuers of all + * workqueues which have works queued on the gcwq and let them process + * those works so that forward progress can be guaranteed. + * + * This should happen rarely. + */ +static int rescuer_thread(void *__wq) +{ + struct workqueue_struct *wq = __wq; + struct worker *rescuer = wq->rescuer; + struct list_head *scheduled = &rescuer->scheduled; + bool is_unbound = wq->flags & WQ_UNBOUND; + unsigned int cpu; + + set_user_nice(current, RESCUER_NICE_LEVEL); +repeat: + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + /* + * See whether any cpu is asking for help. Unbounded + * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. + */ + for_each_mayday_cpu(cpu, wq->mayday_mask) { + unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; + struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + struct work_struct *work, *n; + + __set_current_state(TASK_RUNNING); + mayday_clear_cpu(cpu, wq->mayday_mask); + + /* migrate to the target cpu if possible */ + rescuer->gcwq = gcwq; + worker_maybe_bind_and_lock(rescuer); + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + BUG_ON(!list_empty(&rescuer->scheduled)); + list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + if (get_work_cwq(work) == cwq) + move_linked_works(work, scheduled, &n); + + process_scheduled_works(rescuer); + spin_unlock_irq(&gcwq->lock); + } + + schedule(); + goto repeat; } struct wq_barrier { @@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @target: target work to attach @barr to + * @worker: worker currently executing @target, NULL if @target is not executing + * + * @barr is linked to @target such that @barr is completed only after + * @target finishes execution. Please note that the ordering + * guarantee is observed only with respect to @target and on the local + * cpu. + * + * Currently, a queued barrier can't be canceled. This is because + * try_to_grab_pending() can't determine whether the work to be + * grabbed is at the head of the queue and thus can't clear LINKED + * flag of the previous work while there must be a valid next work + * after a work with LINKED flag set. + * + * Note that when @worker is non-NULL, @target may be modified + * underneath us, so we can't reliably determine cwq from @target. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) + struct wq_barrier *barr, + struct work_struct *target, struct worker *worker) { + struct list_head *head; + unsigned int linked = 0; + /* - * debugobject calls are safe here even with cwq->lock locked + * debugobject calls are safe here even with gcwq->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + /* + * If @target is currently being executed, schedule the + * barrier to the worker; otherwise, put it after @target. + */ + if (worker) + head = worker->scheduled.next; + else { + unsigned long *bits = work_data_bits(target); + + head = target->entry.next; + /* there can already be other linked works, inherit and set */ + linked = *bits & WORK_STRUCT_LINKED; + __set_bit(WORK_STRUCT_LINKED_BIT, bits); + } + debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head); + insert_work(cwq, &barr->work, head, + work_color_to_flags(WORK_NO_COLOR) | linked); } -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +/** + * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * @wq: workqueue being flushed + * @flush_color: new flush color, < 0 for no-op + * @work_color: new work color, < 0 for no-op + * + * Prepare cwqs for workqueue flushing. + * + * If @flush_color is non-negative, flush_color on all cwqs should be + * -1. If no cwq has in-flight commands at the specified color, all + * cwq->flush_color's stay at -1 and %false is returned. If any cwq + * has in flight commands, its cwq->flush_color is set to + * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * wakeup logic is armed and %true is returned. + * + * The caller should have initialized @wq->first_flusher prior to + * calling this function with non-negative @flush_color. If + * @flush_color is negative, no flush color update is done and %false + * is returned. + * + * If @work_color is non-negative, all cwqs should have the same + * work_color which is previous to @work_color and all will be + * advanced to @work_color. + * + * CONTEXT: + * mutex_lock(wq->flush_mutex). + * + * RETURNS: + * %true if @flush_color >= 0 and there's something to flush. %false + * otherwise. + */ +static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, + int flush_color, int work_color) { - int active = 0; - struct wq_barrier barr; + bool wait = false; + unsigned int cpu; - WARN_ON(cwq->thread == current); - - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; + if (flush_color >= 0) { + BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); + atomic_set(&wq->nr_cwqs_to_flush, 1); } - spin_unlock_irq(&cwq->lock); - if (active) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + + spin_lock_irq(&gcwq->lock); + + if (flush_color >= 0) { + BUG_ON(cwq->flush_color != -1); + + if (cwq->nr_in_flight[flush_color]) { + cwq->flush_color = flush_color; + atomic_inc(&wq->nr_cwqs_to_flush); + wait = true; + } + } + + if (work_color >= 0) { + BUG_ON(work_color != work_next_color(cwq->work_color)); + cwq->work_color = work_color; + } + + spin_unlock_irq(&gcwq->lock); } - return active; + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + complete(&wq->first_flusher->done); + + return wait; } /** @@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. */ void flush_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + struct wq_flusher this_flusher = { + .list = LIST_HEAD_INIT(this_flusher.list), + .flush_color = -1, + .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + }; + int next_color; - might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu(cpu, cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); + + mutex_lock(&wq->flush_mutex); + + /* + * Start-to-wait phase + */ + next_color = work_next_color(wq->work_color); + + if (next_color != wq->flush_color) { + /* + * Color space is not full. The current work_color + * becomes our flush_color and work_color is advanced + * by one. + */ + BUG_ON(!list_empty(&wq->flusher_overflow)); + this_flusher.flush_color = wq->work_color; + wq->work_color = next_color; + + if (!wq->first_flusher) { + /* no flush in progress, become the first flusher */ + BUG_ON(wq->flush_color != this_flusher.flush_color); + + wq->first_flusher = &this_flusher; + + if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + wq->work_color)) { + /* nothing to flush, done */ + wq->flush_color = next_color; + wq->first_flusher = NULL; + goto out_unlock; + } + } else { + /* wait in queue */ + BUG_ON(wq->flush_color == this_flusher.flush_color); + list_add_tail(&this_flusher.list, &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + } else { + /* + * Oops, color space is full, wait on overflow queue. + * The next flush completion will assign us + * flush_color and transfer to flusher_queue. + */ + list_add_tail(&this_flusher.list, &wq->flusher_overflow); + } + + mutex_unlock(&wq->flush_mutex); + + wait_for_completion(&this_flusher.done); + + /* + * Wake-up-and-cascade phase + * + * First flushers are responsible for cascading flushes and + * handling overflow. Non-first flushers can simply return. + */ + if (wq->first_flusher != &this_flusher) + return; + + mutex_lock(&wq->flush_mutex); + + /* we might have raced, check again with mutex held */ + if (wq->first_flusher != &this_flusher) + goto out_unlock; + + wq->first_flusher = NULL; + + BUG_ON(!list_empty(&this_flusher.list)); + BUG_ON(wq->flush_color != this_flusher.flush_color); + + while (true) { + struct wq_flusher *next, *tmp; + + /* complete all the flushers sharing the current flush color */ + list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { + if (next->flush_color != wq->flush_color) + break; + list_del_init(&next->list); + complete(&next->done); + } + + BUG_ON(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); + + /* this flush_color is finished, advance by one */ + wq->flush_color = work_next_color(wq->flush_color); + + /* one color has been freed, handle overflow queue */ + if (!list_empty(&wq->flusher_overflow)) { + /* + * Assign the same color to all overflowed + * flushers, advance work_color and append to + * flusher_queue. This is the start-to-wait + * phase for these overflowed flushers. + */ + list_for_each_entry(tmp, &wq->flusher_overflow, list) + tmp->flush_color = wq->work_color; + + wq->work_color = work_next_color(wq->work_color); + + list_splice_tail_init(&wq->flusher_overflow, + &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + + if (list_empty(&wq->flusher_queue)) { + BUG_ON(wq->flush_color != wq->work_color); + break; + } + + /* + * Need to flush more colors. Make the next flusher + * the new first flusher and arm cwqs. + */ + BUG_ON(wq->flush_color == wq->work_color); + BUG_ON(wq->flush_color != next->flush_color); + + list_del_init(&next->list); + wq->first_flusher = next; + + if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + break; + + /* + * Meh... this color is already done, clear first + * flusher and repeat cascading. + */ + wq->first_flusher = NULL; + } + +out_unlock: + mutex_unlock(&wq->flush_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); */ int flush_work(struct work_struct *work) { + struct worker *worker = NULL; + struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct list_head *prev; struct wq_barrier barr; might_sleep(); - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return 0; - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. + * If it was re-queued to a different gcwq under us, we + * are not going to wait. */ smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; + cwq = get_work_cwq(work); + if (unlikely(!cwq || gcwq != cwq->gcwq)) + goto already_gone; } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; + worker = find_worker_executing_work(gcwq, work); + if (!worker) + goto already_gone; + cwq = worker->current_cwq; } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + + insert_wq_barrier(cwq, &barr, work, worker); + spin_unlock_irq(&gcwq->lock); + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&gcwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work); */ static int try_to_grab_pending(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; + struct global_cwq *gcwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return ret; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong cwq. + * This work is queued, but perhaps we locked the wrong gcwq. * In that case we must see the new value after rmb(), see * insert_work()->wmb(). */ smp_rmb(); - if (cwq == get_wq_data(work)) { + if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work)); ret = 1; } } - spin_unlock_irq(&cwq->lock); + spin_unlock_irq(&gcwq->lock); return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) { struct wq_barrier barr; - int running = 0; + struct worker *worker; - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); + + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); - if (unlikely(running)) { + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); } @@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, static void wait_on_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + for_each_gcwq_cpu(cpu) + wait_on_cpu_work(get_gcwq(cpu), work); } static int __cancel_work_timer(struct work_struct *work, @@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - clear_wq_data(work); + clear_work_data(work); return ret; } @@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -static struct workqueue_struct *keventd_wq __read_mostly; - /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; */ int schedule_work(struct work_struct *work) { - return queue_work(keventd_wq, work); + return queue_work(system_wq, work); } EXPORT_SYMBOL(schedule_work); @@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work); */ int schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, keventd_wq, work); + return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); @@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, dwork, delay); + return queue_delayed_work(system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, + &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -820,8 +2568,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; - int orig = -1; - struct work_struct *works; + struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) @@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func) get_online_cpus(); - /* - * When running in keventd don't schedule a work item on - * itself. Can just call directly because the work queue is - * already bound. This also is faster. - */ - if (current_is_keventd()) - orig = raw_smp_processor_id(); - for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - if (cpu != orig) - schedule_work_on(cpu, work); + schedule_work_on(cpu, work); } - if (orig >= 0) - func(per_cpu_ptr(works, orig)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); @@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func) */ void flush_scheduled_work(void) { - flush_workqueue(keventd_wq); + flush_workqueue(system_wq); } EXPORT_SYMBOL(flush_scheduled_work); @@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); int keventd_up(void) { - return keventd_wq != NULL; + return system_wq != NULL; } -int current_is_keventd(void) +static int alloc_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; + if (percpu) + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + else { + void *ptr; - return ret; + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } + } + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 0 : -ENOMEM; } -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +static void free_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - return cwq; + if (percpu) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ + kfree(*(void **)(wq->cpu_wq.single + 1)); + } } -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; + int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. - */ - if (IS_ERR(p)) - return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - cwq->thread = p; + if (max_active < 1 || max_active > lim) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); - trace_workqueue_creation(cwq->thread, cpu); - - return 0; + return clamp_val(max_active, 1, lim); } -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) { - struct task_struct *p = cwq->thread; + struct workqueue_struct *wq; + unsigned int cpu; - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + wq->flags = flags; + wq->saved_max_active = max_active; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. - */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); + if (alloc_cwqs(wq) < 0) + goto err; + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = get_gcwq(cpu); + + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); + cwq->gcwq = gcwq; + cwq->wq = wq; + cwq->flush_color = -1; + cwq->max_active = max_active; + INIT_LIST_HEAD(&cwq->delayed_works); } - if (err) { - destroy_workqueue(wq); - wq = NULL; + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + wq->rescuer = rescuer; + rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. */ - if (cwq->thread == NULL) - return; + spin_lock(&workqueue_lock); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + for_each_cwq_cpu(cpu, wq) + get_cwq(cpu, wq)->max_active = 0; - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - trace_workqueue_destruction(cwq->thread); - kthread_stop(cwq->thread); - cwq->thread = NULL; + list_add(&wq->list, &workqueues); + + spin_unlock(&workqueue_lock); + + return wq; +err: + if (wq) { + free_cwqs(wq); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + kfree(wq); + } + return NULL; } +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue @@ -1086,72 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + unsigned int cpu; - cpu_maps_update_begin(); + flush_workqueue(wq); + + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. + */ spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu(cpu, cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); + /* sanity check */ + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); + } + + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_mayday_mask(wq->mayday_mask); + } - free_percpu(wq->cpu_wq); + free_cwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_cwq_cpu(cpu, wq) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZEABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. + */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : WORK_CPU_NONE; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return false; + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve mix of short, long and very long running works making + * blocked draining impractical. + * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has went down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezeable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } + } + + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. + */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int err = 0; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ case CPU_UP_PREPARE: - cpumask_set_cpu(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - err = create_workqueue_thread(cwq, cpu); - if (!err) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - err = -ENOMEM; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; } } + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + switch (action) { - case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + break; + case CPU_POST_DEAD: - cpumask_clear_cpu(cpu, cpu_populated_map); + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; } - return notifier_from_errno(err); + spin_unlock_irqrestore(&gcwq->lock, flags); + + return notifier_from_errno(0); } #ifdef CONFIG_SMP @@ -1201,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) EXPORT_SYMBOL_GPL(work_on_cpu); #endif /* CONFIG_SMP */ -void __init init_workqueues(void) +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all + * freezeable workqueues will queue new works to their frozen_works + * list instead of gcwq->worklist. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void freeze_workqueues_begin(void) { - alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + unsigned int cpu; - cpumask_copy(cpu_populated_map, cpu_online_mask); - singlethread_cpu = cpumask_first(cpu_possible_mask); - cpu_singlethread_map = cpumask_of(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (cwq && wq->flags & WQ_FREEZEABLE) + cwq->max_active = 0; + } + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezeable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezeable workqueues are still busy. %false if + * freezing is complete. + */ +bool freeze_workqueues_busy(void) +{ + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_gcwq_cpu(cpu) { + struct workqueue_struct *wq; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective gcwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void thaw_workqueues(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + wake_up_worker(gcwq); + + spin_unlock_irq(&gcwq->lock); + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + +static int __init init_workqueues(void) +{ + unsigned int cpu; + int i; + + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_init(&gcwq->lock); + INIT_LIST_HEAD(&gcwq->worklist); + gcwq->cpu = cpu; + if (cpu == WORK_CPU_UNBOUND) + gcwq->flags |= GCWQ_DISASSOCIATED; + + INIT_LIST_HEAD(&gcwq->idle_list); + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) + INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; + + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + + ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); + } + + /* create the initial worker */ + for_each_online_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + return 0; } +early_initcall(init_workqueues); diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 000000000000..2d10fc98dc79 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,9 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); |