diff options
Diffstat (limited to 'kernel')
136 files changed, 6950 insertions, 5096 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ac0d533eb7de..bbde5f1a4486 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ async.o range.o groups.o lglock.o smpboot.o @@ -25,9 +25,7 @@ endif obj-y += sched/ obj-y += power/ -ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) -obj-$(CONFIG_X86) += kcmp.o -endif +obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -54,7 +52,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -127,20 +125,32 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(obj)/time.o: $(obj)/timeconst.h -quiet_cmd_timeconst = TIMEC $@ - cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + targets += timeconst.h -$(obj)/timeconst.h: $(src)/timeconst.pl FORCE - $(call if_changed,timeconst) +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) ifeq ($(CONFIG_MODULE_SIG),y) # # Pull the signing certificate and any extra certificates into the kernel # + +quiet_cmd_touch = TOUCH $@ + cmd_touch = touch $@ + extra_certificates: - touch $@ + $(call cmd,touch) -kernel/modsign_pubkey.o: signing_key.x509 extra_certificates +kernel/modsign_certificate.o: signing_key.x509 extra_certificates ############################################################################### # @@ -149,23 +159,7 @@ kernel/modsign_pubkey.o: signing_key.x509 extra_certificates # fail and that the kernel may be used afterwards. # ############################################################################### -sign_key_with_hash := -ifeq ($(CONFIG_MODULE_SIG_SHA1),y) -sign_key_with_hash := -sha1 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA224),y) -sign_key_with_hash := -sha224 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA256),y) -sign_key_with_hash := -sha256 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA384),y) -sign_key_with_hash := -sha384 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA512),y) -sign_key_with_hash := -sha512 -endif -ifeq ($(sign_key_with_hash),) +ifndef CONFIG_MODULE_SIG_HASH $(error Could not determine digest type to use from kernel config) endif @@ -178,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey @echo "### needs to be run as root, and uses a hardware random" @echo "### number generator if one is available." @echo "###" - openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ - -x509 -config x509.genkey \ + openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ + -batch -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ -keyout signing_key.priv @echo "###" diff --git a/kernel/acct.c b/kernel/acct.c index 051e071a06e7..b9bd7f098ee5 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { + if (!S_ISREG(file_inode(file)->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -566,6 +566,7 @@ out: void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; + cputime_t utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { @@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime += current->utime; - pacct->ac_stime += current->stime; + task_cputime(current, &utime, &stime); + pacct->ac_utime += utime; + pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/async.c b/kernel/async.c index 9d3118384858..8ddee2c3e5b0 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel. #include <linux/slab.h> #include <linux/workqueue.h> +#include "workqueue_internal.h" + static async_cookie_t next_cookie = 1; -#define MAX_WORK 32768 +#define MAX_WORK 32768 +#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ -static LIST_HEAD(async_pending); -static ASYNC_DOMAIN(async_running); -static LIST_HEAD(async_domains); +static LIST_HEAD(async_global_pending); /* pending from all registered doms */ +static ASYNC_DOMAIN(async_dfl_domain); static DEFINE_SPINLOCK(async_lock); -static DEFINE_MUTEX(async_register_mutex); struct async_entry { - struct list_head list; + struct list_head domain_list; + struct list_head global_list; struct work_struct work; async_cookie_t cookie; async_func_ptr *func; void *data; - struct async_domain *running; + struct async_domain *domain; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; - -/* - * MUST be called with the lock held! - */ -static async_cookie_t __lowest_in_progress(struct async_domain *running) +static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct async_entry *entry; - - if (!list_empty(&running->domain)) { - entry = list_first_entry(&running->domain, typeof(*entry), list); - return entry->cookie; - } + struct async_entry *first = NULL; + async_cookie_t ret = ASYNC_COOKIE_MAX; + unsigned long flags; - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) - return entry->cookie; + spin_lock_irqsave(&async_lock, flags); - return next_cookie; /* "infinity" value */ -} + if (domain) { + if (!list_empty(&domain->pending)) + first = list_first_entry(&domain->pending, + struct async_entry, domain_list); + } else { + if (!list_empty(&async_global_pending)) + first = list_first_entry(&async_global_pending, + struct async_entry, global_list); + } -static async_cookie_t lowest_in_progress(struct async_domain *running) -{ - unsigned long flags; - async_cookie_t ret; + if (first) + ret = first->cookie; - spin_lock_irqsave(&async_lock, flags); - ret = __lowest_in_progress(running); spin_unlock_irqrestore(&async_lock, flags); return ret; } @@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; - struct async_domain *running = entry->running; - /* 1) move self to the running queue */ - spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, &running->domain); - spin_unlock_irqrestore(&async_lock, flags); - - /* 2) run (and print duration) */ + /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "calling %lli_%pF @ %i\n", (long long)entry->cookie, @@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work) (long long)ktime_to_ns(delta) >> 10); } - /* 3) remove self from the running queue */ + /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); - list_del(&entry->list); - if (running->registered && --running->count == 0) - list_del_init(&running->node); + list_del_init(&entry->domain_list); + list_del_init(&entry->global_list); - /* 4) free the entry */ + /* 3) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 5) wake up any waiters */ + /* 4) wake up any waiters */ wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -183,19 +172,28 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a ptr(data, newcookie); return newcookie; } + INIT_LIST_HEAD(&entry->domain_list); + INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; - entry->running = running; + entry->domain = domain; spin_lock_irqsave(&async_lock, flags); + + /* allocate cookie and queue */ newcookie = entry->cookie = next_cookie++; - list_add_tail(&entry->list, &async_pending); - if (running->registered && running->count++ == 0) - list_add_tail(&running->node, &async_domains); + + list_add_tail(&entry->domain_list, &domain->pending); + if (domain->registered) + list_add_tail(&entry->global_list, &async_global_pending); + atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); + /* mark that this task has queued an async job, used by module init */ + current->flags |= PF_USED_ASYNC; + /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); @@ -212,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a */ async_cookie_t async_schedule(async_func_ptr *ptr, void *data) { - return __async_schedule(ptr, data, &async_running); + return __async_schedule(ptr, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); @@ -220,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule); * async_schedule_domain - schedule a function for asynchronous execution within a certain domain * @ptr: function to execute asynchronously * @data: data pointer to pass to the function - * @running: running list for the domain + * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_domain() functions - * to wait within a certain synchronization domain rather than globally. - * A synchronization domain is specified via the running queue @running to use. - * Note: This function may be called from atomic or non-atomic contexts. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. A + * synchronization domain is specified via @domain. Note: This function + * may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct async_domain *running) + struct async_domain *domain) { - return __async_schedule(ptr, data, running); + return __async_schedule(ptr, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); @@ -242,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { - mutex_lock(&async_register_mutex); - do { - struct async_domain *domain = NULL; - - spin_lock_irq(&async_lock); - if (!list_empty(&async_domains)) - domain = list_first_entry(&async_domains, typeof(*domain), node); - spin_unlock_irq(&async_lock); - - async_synchronize_cookie_domain(next_cookie, domain); - } while (!list_empty(&async_domains)); - mutex_unlock(&async_register_mutex); + async_synchronize_full_domain(NULL); } EXPORT_SYMBOL_GPL(async_synchronize_full); @@ -268,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full); */ void async_unregister_domain(struct async_domain *domain) { - mutex_lock(&async_register_mutex); spin_lock_irq(&async_lock); - WARN_ON(!domain->registered || !list_empty(&domain->node) || - !list_empty(&domain->domain)); + WARN_ON(!domain->registered || !list_empty(&domain->pending)); domain->registered = 0; spin_unlock_irq(&async_lock); - mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_unregister_domain); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @domain: running list to synchronize on + * @domain: the domain to synchronize * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @domain have been done. + * synchronization domain specified by @domain have been done. */ void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, domain); + async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint - * @running: running list to synchronize on + * @domain: the domain to synchronize (%NULL for all registered domains) * * This function waits until all asynchronous function calls for the - * synchronization domain specified by running list @running submitted - * prior to @cookie have been done. + * synchronization domain specified by @domain submitted prior to @cookie + * have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { ktime_t uninitialized_var(starttime), delta, endtime; - if (!running) - return; - if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } - wait_event(async_done, lowest_in_progress(running) >= cookie); + wait_event(async_done, lowest_in_progress(domain) >= cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); @@ -334,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); */ void async_synchronize_cookie(async_cookie_t cookie) { - async_synchronize_cookie_domain(cookie, &async_running); + async_synchronize_cookie_domain(cookie, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); + +/** + * current_is_async - is %current an async worker task? + * + * Returns %true if %current is an async worker task. + */ +bool current_is_async(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker->current_func == async_run_entry_fn; +} diff --git a/kernel/audit.c b/kernel/audit.c index 40414e9143db..d596e5355f15 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old, int rc = 0; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return rc; audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { @@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + if (unlikely(!*ab)) + return rc; audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", task_tgid_vnr(current), from_kuid(&init_user_ns, current_uid()), @@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } +/* + * Wait for auditd to drain the queue a little + */ +static void wait_for_auditd(unsigned long sleep_time) +{ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&audit_backlog_wait, &wait); + + if (audit_backlog_limit && + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + schedule_timeout(sleep_time); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&audit_backlog_wait, &wait); +} + /* Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the tsk is a task that is currently in a @@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time - && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { + if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + unsigned long sleep_time; - /* Wait for auditd to drain the queue a little */ - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&audit_backlog_wait, &wait); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) - schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); + sleep_time = timeout_start + audit_backlog_wait_time - + jiffies; + if ((long)sleep_time > 0) + wait_for_auditd(sleep_time); continue; } if (audit_rate_check() && printk_ratelimit()) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ed206fd88cca..642a89c4f3d6 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -249,7 +249,7 @@ static void untag_chunk(struct node *p) list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); + fsnotify_destroy_mark(entry, audit_tree_group); goto out; } @@ -291,7 +291,7 @@ static void untag_chunk(struct node *p) owner->root = new; spin_unlock(&hash_lock); spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); + fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; @@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); + fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(entry); return 0; } @@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); - fsnotify_destroy_mark(chunk_entry); + fsnotify_destroy_mark(chunk_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); - fsnotify_destroy_mark(old_entry); + fsnotify_destroy_mark(old_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ return 0; } +static void audit_log_remove_rule(struct audit_krule *rule) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); + audit_log_untrustedstring(ab, rule->tree->pathname); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; - struct audit_buffer *ab; list_for_each_entry_safe(rule, next, &tree->rules, rlist) { entry = container_of(rule, struct audit_entry, rule); @@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); - audit_log_format(ab, " dir="); - audit_log_untrustedstring(ab, rule->tree->pathname); - audit_log_key(ab, rule->filterkey); - audit_log_format(ab, " list=%d res=1", rule->listnr); - audit_log_end(ab); + audit_log_remove_rule(rule); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 9a9ae6e3d290..22831c4d369c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (audit_enabled) { struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; audit_log_format(ab, "auid=%u ses=%u op=", from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); @@ -350,7 +352,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } mutex_unlock(&audit_filter_mutex); - fsnotify_destroy_mark(&parent->mark); + fsnotify_destroy_mark(&parent->mark, audit_watch_group); } /* Get path information necessary for adding watches. */ @@ -457,7 +459,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) if (list_empty(&parent->watches)) { audit_get_parent(parent); - fsnotify_destroy_mark(&parent->mark); + fsnotify_destroy_mark(&parent->mark, audit_watch_group); audit_put_parent(parent); } } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7f19f23d38a3..f9fc54bbe06f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, * audit_receive_filter - apply all rules to the specified message type * @type: audit message type * @pid: target pid for netlink audit messages - * @uid: target uid for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e37e6a12c5e3..a371f857a0a9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); + if (unlikely(!ab)) + return; audit_log_format(ab, "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, context->ipc.perm_mode); - if (!ab) - return; } break; } case AUDIT_MQ_OPEN: { @@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; @@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); +} + +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + audit_log_task(ab); audit_log_format(ab, " reason="); audit_log_string(ab, reason); audit_log_format(ab, " sig=%ld", signr); @@ -2715,6 +2720,8 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + if (unlikely(!ab)) + return; audit_log_abend(ab, "memory violation", signr); audit_log_end(ab); } @@ -2723,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", signr); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + if (unlikely(!ab)) + return; + audit_log_task(ab); + audit_log_format(ab, " sig=%ld", signr); audit_log_format(ab, " syscall=%ld", syscall); audit_log_format(ab, " compat=%d", is_compat_task()); audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f34c41bfaa37..a32f9432666c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,7 @@ #include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> -#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> @@ -376,22 +376,18 @@ static int css_set_count; * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 -#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { int i; - int index; - unsigned long tmp = 0UL; + unsigned long key = 0UL; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) - tmp += (unsigned long)css[i]; - tmp = (tmp >> 16) ^ tmp; + key += (unsigned long)css[i]; + key = (key >> 16) ^ key; - index = hash_long(tmp, CSS_SET_HASH_BITS); - - return &css_set_table[index]; + return key; } /* We don't maintain the lists running through each css_set to its @@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } /* This css_set is dead. unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); + hash_del(&cg->hlist); css_set_count--; list_for_each_entry_safe(link, saved_link, &cg->cg_links, @@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) struct cgroup *cgrp = link->cgrp; list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); + + /* + * We may not be holding cgroup_mutex, and if cgrp->count is + * dropped to 0 the cgroup can be destroyed at any time, hence + * rcu_read_lock is used to keep it alive. + */ + rcu_read_lock(); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + rcu_read_unlock(); kfree(link); } @@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct hlist_head *hhead; - struct hlist_node *node; struct css_set *cg; + unsigned long key; /* * Build the set of subsystem state objects that we want to see in the @@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set( } } - hhead = css_set_hash(template); - hlist_for_each_entry(cg, node, hhead, hlist) { + key = css_set_hash(template); + hash_for_each_possible(css_set_table, cg, hlist, key) { if (!compare_css_sets(cg, oldcg, cgrp, template)) continue; @@ -657,8 +660,8 @@ static struct css_set *find_css_set( struct list_head tmp_cg_links; - struct hlist_head *hhead; struct cg_cgroup_link *link; + unsigned long key; /* First see if we already have a cgroup group that matches * the desired set */ @@ -704,8 +707,8 @@ static struct css_set *find_css_set( css_set_count++; /* Add this cgroup group to the hash table */ - hhead = css_set_hash(res->subsys); - hlist_add_head(&res->hlist, hhead); + key = css_set_hash(res->subsys); + hash_add(css_set_table, &res->hlist, key); write_unlock(&css_set_lock); @@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) +static void cgroup_free_fn(struct work_struct *work) { - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - struct cgroup_subsys *ss; - BUG_ON(!(cgroup_is_removed(cgrp))); - /* It's possible for external users to be holding css - * reference counts on a cgroup; css_put() needs to - * be able to access the cgroup after decrementing - * the reference count in order to know if it needs to - * queue the cgroup to be handled by the release - * agent */ - synchronize_rcu(); + struct cgroup *cgrp = container_of(work, struct cgroup, free_work); + struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) - ss->css_free(cgrp); + mutex_lock(&cgroup_mutex); + /* + * Release the subsystem state objects. + */ + for_each_subsys(cgrp->root, ss) + ss->css_free(cgrp); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); + cgrp->root->number_of_cgroups--; + mutex_unlock(&cgroup_mutex); - /* - * Drop the active superblock reference that we took when we - * created the cgroup - */ - deactivate_super(cgrp->root->sb); + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ + deactivate_super(cgrp->root->sb); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + + simple_xattrs_free(&cgrp->xattrs); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + kfree(cgrp); +} - simple_xattrs_free(&cgrp->xattrs); +static void cgroup_free_rcu(struct rcu_head *head) +{ + struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); + + schedule_work(&cgrp->free_work); +} + +static void cgroup_diput(struct dentry *dentry, struct inode *inode) +{ + /* is dentry a directory ? if so, kfree() associated cgroup */ + if (S_ISDIR(inode->i_mode)) { + struct cgroup *cgrp = dentry->d_fsdata; - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - kfree_rcu(cgrp, rcu_head); + BUG_ON(!(cgroup_is_removed(cgrp))); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; @@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d) dput(parent); } -static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { struct cfent *cfe; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); + /* + * If we're doing cleanup due to failure of cgroup_create(), + * the corresponding @cfe may not exist. + */ list_for_each_entry(cfe, &cgrp->files, node) { struct dentry *d = cfe->dentry; @@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) list_del_init(&cfe->node); dput(d); - return 0; + break; } - return -ENOENT; } /** @@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; - synchronize_rcu(); return 0; } @@ -1333,7 +1345,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* See feature-removal-schedule.txt */ if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); @@ -1394,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); + INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); @@ -1598,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroupfs_root *existing_root; const struct cred *cred; int i; + struct css_set *cg; BUG_ON(sb->s_root != NULL); @@ -1651,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct hlist_head *hhead = &css_set_table[i]; - struct hlist_node *node; - struct css_set *cg; - - hlist_for_each_entry(cg, node, hhead, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); - } + hash_for_each(css_set_table, i, cg, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -1774,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), "cgroup_path() called without proper locking"); - if (!dentry || cgrp == dummytop) { + if (cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1983,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) ss->attach(cgrp, &tset); } - synchronize_rcu(); out: if (retval) { for_each_subsys(root, ss) { @@ -2152,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* * step 5: success! and cleanup */ - synchronize_rcu(); retval = 0; out_put_css_set_refs: if (retval) { @@ -2638,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un */ static inline struct cftype *__file_cft(struct file *file) { - if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + if (file_inode(file)->i_fop != &cgroup_file_operations) return ERR_PTR(-EINVAL); return __d_cft(file->f_dentry); } @@ -2770,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) continue; - if (is_add) + if (is_add) { err = cgroup_add_file(cgrp, subsys, cft); - else - err = cgroup_rm_file(cgrp, cft); - if (err) { - pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", - is_add ? "add" : "remove", cft->name, err); + if (err) + pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", + cft->name, err); ret = err; + } else { + cgroup_rm_file(cgrp, cft); } } return ret; @@ -3018,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, } EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +/** + * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup + * @pos: cgroup of interest + * + * Return the rightmost descendant of @pos. If there's no descendant, + * @pos is returned. This can be used during pre-order traversal to skip + * subtree of @pos. + */ +struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +{ + struct cgroup *last, *tmp; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + last = pos; + /* ->prev isn't RCU safe, walk ->next till the end */ + pos = NULL; + list_for_each_entry_rcu(tmp, &last->children, sibling) + pos = tmp; + } while (pos); + + return last; +} +EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); + static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) { struct cgroup *last; @@ -3409,7 +3440,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case @@ -3753,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work) remove); struct cgroup *cgrp = event->cgrp; + remove_wait_queue(event->wqh, &event->wait); + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + eventfd_ctx_put(event->eventfd); kfree(event); dput(cgrp->dentry); @@ -3774,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - __remove_wait_queue(event->wqh, &event->wait); - spin_lock(&cgrp->event_list_lock); - list_del_init(&event->list); - spin_unlock(&cgrp->event_list_lock); /* - * We are in atomic context, but cgroup_event_remove() may - * sleep, so we have to call it in workqueue. + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. */ - schedule_work(&event->remove); + spin_lock(&cgrp->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); } return 0; @@ -3808,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { struct cgroup_event *event = NULL; + struct cgroup *cgrp_cfile; unsigned int efd, cfd; struct file *efile = NULL; struct file *cfile = NULL; @@ -3853,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); + ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) goto fail; @@ -3863,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, goto fail; } + /* + * The file to be monitored must be in the same cgroup as + * cgroup.event_control is. + */ + cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); + if (cgrp_cfile != cgrp) { + ret = -EINVAL; + goto fail; + } + if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto fail; @@ -4136,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_housekeeping(cgrp); + dentry->d_fsdata = cgrp; + cgrp->dentry = dentry; + cgrp->parent = parent; cgrp->root = parent->root; cgrp->top_cgroup = parent->top_cgroup; @@ -4173,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, lockdep_assert_held(&dentry->d_inode->i_mutex); /* allocation complete, commit to creation */ - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; @@ -4341,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. Use - * a temporary list to avoid a deadlock with cgroup_event_wake(). Since - * cgroup_event_wake() is called with the wait queue head locked, - * remove_wait_queue() cannot be called while holding event_list_lock. + * directory to avoid race between userspace and kernelspace. */ spin_lock(&cgrp->event_list_lock); - list_splice_init(&cgrp->event_list, &tmp_list); - spin_unlock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &tmp_list, list) { + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { list_del_init(&event->list); - remove_wait_queue(event->wqh, &event->wait); - eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } + spin_unlock(&cgrp->event_list_lock); return 0; } @@ -4439,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; int i, ret; + struct hlist_node *tmp; + struct css_set *cg; + unsigned long key; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || @@ -4504,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * this is all done under the css_set_lock. */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct css_set *cg; - struct hlist_node *node, *tmp; - struct hlist_head *bucket = &css_set_table[i], *new_bucket; - - hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { - /* skip entries that we already rehashed */ - if (cg->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hlist_del(&cg->hlist); - /* set new value */ - cg->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - new_bucket = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, new_bucket); - } + hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hash_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + key = css_set_hash(cg->subsys); + hash_add(css_set_table, &cg->hlist, key); } write_unlock(&css_set_lock); @@ -4552,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cg_cgroup_link *link; - struct hlist_head *hhead; BUG_ON(ss->module == NULL); @@ -4568,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) offline_css(ss, dummytop); ss->active = 0; - if (ss->use_id) { - idr_remove_all(&ss->idr); + if (ss->use_id) idr_destroy(&ss->idr); - } /* deassign the subsys_id */ subsys[ss->subsys_id] = NULL; @@ -4586,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_lock(&css_set_lock); list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { struct css_set *cg = link->cg; + unsigned long key; - hlist_del(&cg->hlist); + hash_del(&cg->hlist); cg->subsys[ss->subsys_id] = NULL; - hhead = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, hhead); + key = css_set_hash(cg->subsys); + hash_add(css_set_table, &cg->hlist, key); } write_unlock(&css_set_lock); @@ -4632,9 +4679,6 @@ int __init cgroup_init_early(void) list_add(&init_css_set_link.cg_link_list, &init_css_set.cg_links); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -4668,7 +4712,7 @@ int __init cgroup_init(void) { int err; int i; - struct hlist_head *hhead; + unsigned long key; err = bdi_init(&cgroup_backing_dev_info); if (err) @@ -4687,8 +4731,8 @@ int __init cgroup_init(void) } /* Add init_css_set to the hash table */ - hhead = css_set_hash(init_css_set.subsys); - hlist_add_head(&init_css_set.hlist, hhead); + key = css_set_hash(init_css_set.subsys); + hash_add(css_set_table, &init_css_set.hlist, key); BUG_ON(!init_root_id(&rootnode)); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); @@ -4983,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - if (cg) - put_css_set_taskexit(cg); + put_css_set_taskexit(cg); } /** @@ -5275,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id); static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) { struct css_id *newid; - int myid, error, size; + int ret, size; BUG_ON(!ss->use_id); @@ -5283,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) newid = kzalloc(size, GFP_KERNEL); if (!newid) return ERR_PTR(-ENOMEM); - /* get id */ - if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { - error = -ENOMEM; - goto err_out; - } + + idr_preload(GFP_KERNEL); spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ - error = idr_get_new_above(&ss->idr, newid, 1, &myid); + ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); spin_unlock(&ss->id_lock); + idr_preload_end(); /* Returns error when there are no free spaces for new ID.*/ - if (error) { - error = -ENOSPC; + if (ret < 0) goto err_out; - } - if (myid > CSS_ID_MAX) - goto remove_idr; - newid->id = myid; + newid->id = ret; newid->depth = depth; return newid; -remove_idr: - error = -ENOSPC; - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); err_out: kfree(newid); - return ERR_PTR(error); + return ERR_PTR(ret); } @@ -5442,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) struct inode *inode; struct cgroup_subsys_state *css; - inode = f->f_dentry->d_inode; + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); diff --git a/kernel/compat.c b/kernel/compat.c index c28a306ae05c..19971d8c7299 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } -asmlinkage long compat_sys_getitimer(int which, - struct compat_itimerval __user *it) +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) { struct itimerval kit; int error; @@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which, return error; } -asmlinkage long compat_sys_setitimer(int which, - struct compat_itimerval __user *in, - struct compat_itimerval __user *out) +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) { struct itimerval kin, kout; int error; @@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) memcpy(blocked->sig, &set, sizeof(set)); } -asmlinkage long compat_sys_sigprocmask(int how, - compat_old_sigset_t __user *nset, - compat_old_sigset_t __user *oset) +COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, + compat_old_sigset_t __user *, nset, + compat_old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; @@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) return 0; } -asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, - struct compat_rusage __user *ru) +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) { if (!ru) { return sys_wait4(pid, stat_addr, options, NULL); @@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, } } -asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, - struct compat_siginfo __user *uinfo, int options, - struct compat_rusage __user *uru) +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, uinfo, int, options, + struct compat_rusage __user *, uru) { siginfo_t info; struct rusage ru; @@ -584,9 +587,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, return ret; if (uru) { - ret = put_compat_rusage(&ru, uru); + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); if (ret) - return ret; + return -EFAULT; } BUG_ON(info.si_code & __SI_MASK); @@ -964,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } void -sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) { switch (_NSIG_WORDS) { case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); @@ -975,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); -asmlinkage long -compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, - struct compat_siginfo __user *uinfo, - struct compat_timespec __user *uts, compat_size_t sigsetsize) +void +sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +{ + switch (_NSIG_WORDS) { + case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; + case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; + case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; + case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + } +} + +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { compat_sigset_t s32; sigset_t s; @@ -994,7 +1011,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, sigset_from_compat(&s, &s32); if (uts) { - if (get_compat_timespec(&t, uts)) + if (compat_get_timespec(&t, uts)) return -EFAULT; } @@ -1006,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } return ret; - -} - -asmlinkage long -compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, - struct compat_siginfo __user *uinfo) -{ - siginfo_t info; - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #ifdef __ARCH_WANT_COMPAT_SYS_TIME @@ -1060,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) -{ - sigset_t newset; - compat_sigset_t newset32; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&newset, &newset32); - return sigsuspend(&newset); -} -#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ - asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; @@ -1215,6 +1203,22 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) +{ + struct timespec t; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); + set_fs(old_fs); + if (put_compat_timespec(&t, interval)) + return -EFAULT; + return ret; +} + /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd55508..65349f07b878 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,29 +1,41 @@ +/* + * Context tracking: Probe on high level context boundaries such as kernel + * and userspace. This includes syscalls and exceptions entry/exit. + * + * This is used by RCU to remove its dependency on the timer tick while a CPU + * runs in userspace. + * + * Started by Frederic Weisbecker: + * + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * + * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, + * Steven Rostedt, Peter Zijlstra for suggestions and improvements. + * + */ + #include <linux/context_tracking.h> +#include <linux/kvm_host.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/export.h> -struct context_tracking { - /* - * When active is false, hooks are not set to - * minimize overhead: TIF flags are cleared - * and calls to user_enter/exit are ignored. This - * may be further optimized using static keys. - */ - bool active; - enum { - IN_KERNEL = 0, - IN_USER, - } state; -}; - -static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_FORCE .active = true, #endif }; +/** + * user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. + * + * This function must be called right before we switch from the kernel + * to userspace, when it's guaranteed the remaining kernel instructions + * to execute won't use any RCU read side critical section because this + * function sets RCU in extended quiescent state. + */ void user_enter(void) { unsigned long flags; @@ -39,40 +51,90 @@ void user_enter(void) if (in_interrupt()) return; + /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); local_irq_save(flags); if (__this_cpu_read(context_tracking.active) && __this_cpu_read(context_tracking.state) != IN_USER) { - __this_cpu_write(context_tracking.state, IN_USER); + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ + vtime_user_enter(current); rcu_user_enter(); + __this_cpu_write(context_tracking.state, IN_USER); } local_irq_restore(flags); } + +/** + * user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. + * + * This function must be called after we entered the kernel from userspace + * before any use of RCU read side critical section. This potentially include + * any high level kernel code like syscalls, exceptions, signal handling, etc... + * + * This call supports re-entrancy. This way it can be called from any exception + * handler without needing to know if we came from userspace or not. + */ void user_exit(void) { unsigned long flags; - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ if (in_interrupt()) return; local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { - __this_cpu_write(context_tracking.state, IN_KERNEL); + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). + */ rcu_user_exit(); + vtime_user_exit(current); + __this_cpu_write(context_tracking.state, IN_KERNEL); } local_irq_restore(flags); } +void guest_enter(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_enter(current); + else + __guest_enter(); +} +EXPORT_SYMBOL_GPL(guest_enter); + +void guest_exit(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_exit(current); + else + __guest_exit(); +} +EXPORT_SYMBOL_GPL(guest_exit); + + +/** + * context_tracking_task_switch - context switch the syscall callbacks + * @prev: the task that is being switched out + * @next: the task that is being switched in + * + * The context tracking uses the syscall slow path to implement its user-kernel + * boundaries probes on syscalls. This way it doesn't impact the syscall fast + * path on CPUs that don't do context tracking. + * + * But we need to clear the flag on the previous task because it may later + * migrate to some CPU that doesn't do the context tracking. As such the TIF + * flag may not be desired there. + */ void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242c..b5e4ab2d427e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu) static inline void check_for_tasks(int cpu) { struct task_struct *p; + cputime_t utime, stime; write_lock_irq(&tasklist_lock); for_each_process(p) { + task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (p->utime || p->stime)) + (utime || stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, @@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Park the stopper thread */ + kthread_park(current); return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb8..4f9dfe43ecbd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,14 +61,6 @@ #include <linux/cgroup.h> /* - * Workqueue for cpuset related tasks. - * - * Using kevent workqueue may cause deadlock when memory_migrate - * is set. So we create a separate workqueue thread for cpuset. - */ -static struct workqueue_struct *cpuset_wq; - -/* * Tracks how many cpusets are currently defined in system. * When there is only one cpuset (the root cpuset) we can * short circuit some hooks. @@ -95,18 +87,21 @@ struct cpuset { cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - struct cpuset *parent; /* my parent */ - struct fmeter fmeter; /* memory_pressure filter */ + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + /* partition number for rebuild_sched_domains() */ int pn; /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset hierarchy */ - struct list_head stack_list; + struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +static inline struct cpuset *parent_cs(const struct cpuset *cs) +{ + struct cgroup *pcgrp = cs->css.cgroup->parent; + + if (pcgrp) + return cgroup_cs(pcgrp); + return NULL; +} + #ifdef CONFIG_NUMA static inline bool task_has_mempolicy(struct task_struct *task) { @@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) /* bits in struct cpuset flags field */ typedef enum { + CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -147,13 +152,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; -/* the type of hotplug event */ -enum hotplug_event { - CPUSET_CPU_OFFLINE, - CPUSET_MEM_OFFLINE, -}; - /* convenient tests for these bits */ +static inline bool is_cpuset_online(const struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags); +} + static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); @@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | + (1 << CS_MEM_EXCLUSIVE)), }; +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_cgrp: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ + cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ + if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_cgrp by calling + * cgroup_rightmost_descendant() to skip subtree. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ + cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ + if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) + /* - * There are two global mutexes guarding cpuset structures. The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific - * callback_mutex, below. They can nest. It is ok to first take - * cgroup_mutex, then nest callback_mutex. We also require taking - * task_lock() when dereferencing a task's cpuset pointer. See "The - * task_lock() exception", at the end of this comment. - * - * A task must hold both mutexes to modify cpusets. If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets. It can perform various checks on - * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. + * There are two global mutexes guarding cpuset structures - cpuset_mutex + * and callback_mutex. The latter may nest inside the former. We also + * require taking task_lock() when dereferencing a task's cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both mutexes to modify cpusets. If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_mutex and be able to + * modify cpusets. It can perform various checks on the cpuset structure + * first, knowing nothing will change. It can also allocate memory while + * just holding cpuset_mutex. While it is performing these checks, various + * callback routines can briefly acquire callback_mutex to query cpusets. + * Once it is ready to make the changes, it takes callback_mutex, blocking + * everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_mutex, as that would risk double tripping on callback_mutex @@ -232,6 +261,7 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ +static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); /* @@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); /* + * CPU / memory hotplug is handled asynchronously. + */ +static struct workqueue_struct *cpuset_propagate_hotplug_wq; + +static void cpuset_hotplug_workfn(struct work_struct *work); +static void cpuset_propagate_hotplug_workfn(struct work_struct *work); +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) - cs = cs->parent; + cs = parent_cs(cs); if (cs) cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else @@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) - cs = cs->parent; + cs = parent_cs(cs); if (cs) nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); @@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cgroup_mutex held + * Called with callback_mutex/cpuset_mutex held */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. + * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { struct cgroup *cont; struct cpuset *c, *par; + int ret; + + rcu_read_lock(); /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } + ret = -EBUSY; + cpuset_for_each_child(c, cont, cur) + if (!is_cpuset_subset(c, trial)) + goto out; /* Remaining checks don't apply to root cpuset */ + ret = 0; if (cur == &top_cpuset) - return 0; + goto out; - par = cur->parent; + par = parent_cs(cur); /* We must be a subset of our parent cpuset */ + ret = -EACCES; if (!is_cpuset_subset(trial, par)) - return -EACCES; + goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); + ret = -EINVAL; + cpuset_for_each_child(c, cont, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; + goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; + goto out; } - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } + /* + * Cpusets with tasks - existing or newly being attached - can't + * have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && + (cpumask_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed))) + goto out; - return 0; + ret = 0; +out: + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SMP @@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) return; } -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) { - LIST_HEAD(q); - - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); + struct cpuset *cp; + struct cgroup *pos_cgrp; - if (cpumask_empty(cp->cpus_allowed)) + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); continue; + } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } } + rcu_read_unlock(); } /* @@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cgroup_lock held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup *pos_cgrp; doms = NULL; dattr = NULL; @@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; csn = 0; - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. */ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; + if (!cpumask_empty(cp->cpus_allowed) && + !is_sched_load_balance(cp)) continue; - } - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + } + rcu_read_unlock(); for (i = 0; i < csn; i++) csa[i]->pn = i; @@ -725,25 +763,25 @@ done: /* * Rebuild scheduler domains. * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. + * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + lockdep_assert_held(&cpuset_mutex); get_online_cpus(); /* Generate domain masks and attrs */ - cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); @@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) put_online_cpus(); } #else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { } @@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains, } #endif /* CONFIG_SMP */ -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. - * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. - */ -static void async_rebuild_sched_domains(void) -{ - queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */ void rebuild_sched_domains(void) { - do_rebuild_sched_domains(NULL); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); } /** @@ -808,7 +813,7 @@ void rebuild_sched_domains(void) * @tsk: task to test * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Called for each task in a cgroup by cgroup_scan_tasks(). * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). @@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, * cpus_allowed mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, heap_free(&heap); if (is_load_balanced) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); return 0; } @@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cgroup_mutex, so current's cpuset won't change + * Call holding cpuset_mutex, so current's cpuset won't change * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing @@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. + * memory_migrate flag is set. Called with cpuset_mutex held. */ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cgroup_mutex */ + static nodemask_t newmems; /* protected by cpuset_mutex */ cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, &newmems); @@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound; * @oldmem: old mems_allowed of cpuset cs * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * if @heap != NULL. */ @@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cgroup_mutex, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); } return 0; @@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk, * @cs: the cpuset in which each task's spread flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cgroup_mutex held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs, &heap); @@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; int ret; + mutex_lock(&cpuset_mutex); + + ret = -ENOSPC; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; + goto out_unlock; cgroup_taskset_for_each(task, cgrp, tset) { /* @@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * set_cpus_allowed_ptr() on all attached tasks before * cpus_allowed may be changed. */ + ret = -EINVAL; if (task->flags & PF_THREAD_BOUND) - return -EINVAL; - if ((ret = security_task_setscheduler(task))) - return ret; + goto out_unlock; + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; } - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; + ret = 0; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; +} - return 0; +static void cpuset_cancel_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + mutex_lock(&cpuset_mutex); + cgroup_cs(cgrp)->attach_in_progress--; + mutex_unlock(&cpuset_mutex); } +/* + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init(). + */ +static cpumask_var_t cpus_attach; + static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + /* static bufs protected by cpuset_mutex */ + static nodemask_t cpuset_attach_nodemask_from; + static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); @@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + mutex_lock(&cpuset_mutex); + + /* prepare for attach */ + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cgroup_taskset_for_each(task, cgrp, tset) { /* * can_attach beforehand should guarantee that this doesn't @@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) &cpuset_attach_nodemask_to); mmput(mm); } + + cs->attach_in_progress--; + + /* + * We may have raced with CPU/memory hotunplug. Trigger hotplug + * propagation if @cs doesn't have any CPU or memory. It will move + * the newly added tasks to the nearest parent which can execute. + */ + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + schedule_cpuset_propagate_hotplug(cs); + + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -1469,12 +1510,13 @@ typedef enum { static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + int retval = -ENODEV; + + /* + * CPU or memory hotunplug may leave @cs w/o any execution + * resources, in which case the hotplug code asynchronously updates + * configuration and transfers all tasks to the nearest ancestor + * which can execute. + * + * As writes to "cpus" or "mems" may restore @cs's execution + * resources, wait for the previously scheduled operations before + * proceeding, so that we don't end up keep removing tasks added + * after execution capability is restored. + * + * Flushing cpuset_hotplug_work is enough to synchronize against + * hotplug hanlding; however, cpuset_attach() may schedule + * propagation work directly. Flush the workqueue too. + */ + flush_work(&cpuset_hotplug_work); + flush_workqueue(cpuset_propagate_hotplug_wq); - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; trialcs = alloc_trial_cpuset(cs); if (!trialcs) { retval = -ENOMEM; - goto out; + goto out_unlock; } switch (cft->private) { @@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); -out: - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1790,15 +1854,12 @@ static struct cftype files[] = { static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cgroup *parent_cg = cont->parent; - struct cgroup *tmp_cg; - struct cpuset *parent, *cs; + struct cpuset *cs; - if (!parent_cg) + if (!cont->parent) return &top_cpuset.css; - parent = cgroup_cs(parent_cg); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); + cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { @@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) return ERR_PTR(-ENOMEM); } - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); + INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; - cs->parent = parent; + return &cs->css; +} + +static int cpuset_css_online(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *parent = parent_cs(cs); + struct cpuset *tmp_cs; + struct cgroup *pos_cg; + + if (!parent) + return 0; + + mutex_lock(&cpuset_mutex); + + set_bit(CS_ONLINE, &cs->flags); + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) - goto skip_clone; + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ - list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { - struct cpuset *tmp_cs = cgroup_cs(tmp_cg); - - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) - goto skip_clone; + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_cg, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + goto out_unlock; + } } + rcu_read_unlock(); mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); -skip_clone: - return &cs->css; +out_unlock: + mutex_unlock(&cpuset_mutex); + return 0; +} + +static void cpuset_css_offline(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + + mutex_lock(&cpuset_mutex); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + number_of_cpusets--; + clear_bit(CS_ONLINE, &cs->flags); + + mutex_unlock(&cpuset_mutex); } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .subsys_id = cpuset_subsys_id, .base_cftypes = files, @@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, { struct cgroup *new_cgroup = scan->data; + cgroup_lock(); cgroup_attach_task(new_cgroup, tsk); + cgroup_unlock(); } /** @@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, * @from: cpuset in which the tasks currently reside * @to: cpuset to which the tasks will be moved * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * callback_mutex must not be held, as cpuset_attach() will take it. * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, @@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - - /* * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). */ - parent = cs->parent; + parent = parent_cs(cs); while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) - parent = parent->parent; + parent = parent_cs(parent); move_member_tasks_to_cpuset(cs, parent); } -/* - * Helper function to traverse cpusets. - * It can be used to walk the cpuset tree from top to bottom, completing - * one layer before dropping down to the next (thus always processing a - * node before any of its children). +/** + * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * @cs: cpuset in interest + * + * Compare @cs's cpu and mem masks against top_cpuset and if some have gone + * offline, update @cs accordingly. If @cs ends up with no CPU or memory, + * all its tasks are moved to the nearest ancestor with both resources. */ -static struct cpuset *cpuset_next(struct list_head *queue) +static void cpuset_propagate_hotplug_workfn(struct work_struct *work) { - struct cpuset *cp; - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + static cpumask_t off_cpus; + static nodemask_t off_mems, tmp_mems; + struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); + bool is_empty; - if (list_empty(queue)) - return NULL; + mutex_lock(&cpuset_mutex); + + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); + nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - cp = list_first_entry(queue, struct cpuset, stack_list); - list_del(queue->next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, queue); + /* remove offline cpus from @cs */ + if (!cpumask_empty(&off_cpus)) { + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + update_tasks_cpumask(cs, NULL); + } + + /* remove offline mems from @cs */ + if (!nodes_empty(off_mems)) { + tmp_mems = cs->mems_allowed; + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + update_tasks_nodemask(cs, &tmp_mems, NULL); } - return cp; + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * If @cs became empty, move tasks to the nearest ancestor with + * execution resources. This is full cgroup operation which will + * also call back into cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + /* the following may free @cs, should be the last operation */ + css_put(&cs->css); } +/** + * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset + * @cs: cpuset of interest + * + * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and + * memory masks according to top_cpuset. + */ +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) +{ + /* + * Pin @cs. The refcnt will be released when the work item + * finishes executing. + */ + if (!css_tryget(&cs->css)) + return; -/* - * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory - * online/offline) and update the cpusets accordingly. - * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such - * cpuset must be moved to a parent cpuset. + /* + * Queue @cs->hotplug_work. If already pending, lose the css ref. + * cpuset_propagate_hotplug_wq is ordered and propagation will + * happen in the order this function is called. + */ + if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) + css_put(&cs->css); +} + +/** + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. + * This function is called after either CPU or memory configuration has + * changed and updates cpuset accordingly. The top_cpuset is always + * synchronized to cpu_active_mask and N_MEMORY, which is necessary in + * order to make cpusets transparent (of no affect) on systems that are + * actively using CPU hotplug but making no active use of cpusets. * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. + * Non-root cpusets are only affected by offlining. If any CPUs or memory + * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all + * descendants. * - * In the case of memory hot-unplug, it will remove nodes from N_MEMORY - * if all present pages from a node are offlined. + * Note that CPU offlining during suspend is ignored. We don't modify + * cpusets across suspend/resume cycles at all. */ -static void -scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) +static void cpuset_hotplug_workfn(struct work_struct *work) { - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - static nodemask_t oldmems; /* protected by cgroup_mutex */ + static cpumask_t new_cpus, tmp_cpus; + static nodemask_t new_mems, tmp_mems; + bool cpus_updated, mems_updated; + bool cpus_offlined, mems_offlined; - list_add_tail((struct list_head *)&root->stack_list, &queue); + mutex_lock(&cpuset_mutex); - switch (event) { - case CPUSET_CPU_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); + new_mems = node_states[N_MEMORY]; - /* Continue past cpusets with all cpus online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) - continue; + cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); + cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, + &new_cpus); - /* Remove offline cpus from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - mutex_unlock(&callback_mutex); + mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); + mems_offlined = !nodes_empty(tmp_mems); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_cpumask(cp, NULL); - } - break; + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { + mutex_lock(&callback_mutex); + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + mutex_unlock(&callback_mutex); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } - case CPUSET_MEM_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { + tmp_mems = top_cpuset.mems_allowed; + mutex_lock(&callback_mutex); + top_cpuset.mems_allowed = new_mems; + mutex_unlock(&callback_mutex); + update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + } - /* Continue past cpusets with all mems online */ - if (nodes_subset(cp->mems_allowed, - node_states[N_MEMORY])) - continue; + /* if cpus or mems went down, we need to propagate to descendants */ + if (cpus_offlined || mems_offlined) { + struct cpuset *cs; + struct cgroup *pos_cgrp; - oldmems = cp->mems_allowed; + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) + schedule_cpuset_propagate_hotplug(cs); + rcu_read_unlock(); + } - /* Remove offline mems from this cpuset. */ - mutex_lock(&callback_mutex); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&cpuset_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_nodemask(cp, &oldmems, NULL); - } + /* wait for propagations to finish */ + flush_workqueue(cpuset_propagate_hotplug_wq); + + /* rebuild sched domains if cpus_allowed has changed */ + if (cpus_updated) { + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + mutex_lock(&cpuset_mutex); + ndoms = generate_sched_domains(&doms, &attr); + mutex_unlock(&cpuset_mutex); + + partition_sched_domains(ndoms, doms, attr); } } -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * The only exception to this is suspend/resume, where we don't - * modify cpusets at all. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). - * - * @cpu_online: Indicates whether this is a CPU online event (true) or - * a CPU offline event (false). - */ void cpuset_update_active_cpus(bool cpu_online) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - cgroup_lock(); - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - mutex_unlock(&callback_mutex); - - if (!cpu_online) - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); - - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + * + * We still need to do partition_sched_domains() synchronously; + * otherwise, the scheduler will get confused and put tasks to the + * dead CPU. Fall back to the default single domain. + * cpuset_hotplug_workfn() will rebuild it as necessary. + */ + partition_sched_domains(1, NULL, NULL); + schedule_work(&cpuset_hotplug_work); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - cgroup_lock(); - switch (action) { - case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_MEMORY]; - mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); - break; - case MEM_OFFLINE: - /* - * needn't update top_cpuset.mems_allowed explicitly because - * scan_cpusets_upon_hotplug() will update it. - */ - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); - break; - default: - break; - } - cgroup_unlock(); - + schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } #endif @@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void) hotplug_memory_notifier(cpuset_track_online_nodes, 10); - cpuset_wq = create_singlethread_workqueue("cpuset"); - BUG_ON(!cpuset_wq); + cpuset_propagate_hotplug_wq = + alloc_ordered_workqueue("cpuset_hotplug", 0); + BUG_ON(!cpuset_propagate_hotplug_wq); } /** @@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) */ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) + cs = parent_cs(cs); return cs; } @@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) } /** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - -/** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page * @@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) dentry = task_cs(tsk)->css.cgroup->dentry; spin_lock(&cpuset_buffer_lock); - snprintf(cpuset_name, CPUSET_NAME_LEN, - dentry ? (const char *)dentry->d_name.name : "/"); + + if (!dentry) { + strcpy(cpuset_name, "/"); + } else { + spin_lock(&dentry->d_lock); + strlcpy(cpuset_name, (const char *)dentry->d_name.name, + CPUSET_NAME_LEN); + spin_unlock(&dentry->d_lock); + } + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", @@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *unused_v) @@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; - retval = -EINVAL; - cgroup_lock(); + rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + rcu_read_unlock(); if (retval < 0) - goto out_unlock; + goto out_put_task; seq_puts(m, buf); seq_putc(m, '\n'); -out_unlock: - cgroup_unlock(); +out_put_task: put_task_struct(tsk); out_free: kfree(buf); diff --git a/kernel/cred.c b/kernel/cred.c index 48cea3da6d05..e0573a43c7df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -30,17 +30,6 @@ static struct kmem_cache *cred_jar; /* - * The common credentials for the initial task's thread group - */ -#ifdef CONFIG_KEYS -static struct thread_group_cred init_tgcred = { - .usage = ATOMIC_INIT(2), - .tgid = 0, - .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), -}; -#endif - -/* * The initial credentials for the initial task */ struct cred init_cred = { @@ -65,9 +54,6 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, -#ifdef CONFIG_KEYS - .tgcred = &init_tgcred, -#endif }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) } /* - * Dispose of the shared task group credentials - */ -#ifdef CONFIG_KEYS -static void release_tgcred_rcu(struct rcu_head *rcu) -{ - struct thread_group_cred *tgcred = - container_of(rcu, struct thread_group_cred, rcu); - - BUG_ON(atomic_read(&tgcred->usage) != 0); - - key_put(tgcred->session_keyring); - key_put(tgcred->process_keyring); - kfree(tgcred); -} -#endif - -/* - * Release a set of thread group credentials. - */ -static void release_tgcred(struct cred *cred) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = cred->tgcred; - - if (atomic_dec_and_test(&tgcred->usage)) - call_rcu(&tgcred->rcu, release_tgcred_rcu); -#endif -} - -/* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) @@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) #endif security_cred_free(cred); + key_put(cred->session_keyring); + key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); - release_tgcred(cred); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); @@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; -#ifdef CONFIG_KEYS - new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); - if (!new->tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } - atomic_set(&new->tgcred->usage, 1); -#endif - atomic_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; @@ -308,9 +256,10 @@ struct cred *prepare_creds(void) get_user_ns(new->user_ns); #ifdef CONFIG_KEYS + key_get(new->session_keyring); + key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); - atomic_inc(&new->tgcred->usage); #endif #ifdef CONFIG_SECURITY @@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); */ struct cred *prepare_exec_creds(void) { - struct thread_group_cred *tgcred = NULL; struct cred *new; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return NULL; -#endif - new = prepare_creds(); - if (!new) { - kfree(tgcred); + if (!new) return new; - } #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; #endif return new; @@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif struct cred *new; int ret; @@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) install_thread_keyring_to_cred(new); } - /* we share the process and session keyrings between all the threads in - * a process - this is slightly icky as we violate COW credentials a - * bit */ + /* The process keyring is only shared between the threads in a process; + * anything outside of those threads doesn't inherit. + */ if (!(clone_flags & CLONE_THREAD)) { - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - ret = -ENOMEM; - goto error_put; - } - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = key_get(new->tgcred->session_keyring); - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; } #endif @@ -455,6 +372,31 @@ error_put: return ret; } +static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) +{ + const struct user_namespace *set_ns = set->user_ns; + const struct user_namespace *subset_ns = subset->user_ns; + + /* If the two credentials are in the same user namespace see if + * the capabilities of subset are a subset of set. + */ + if (set_ns == subset_ns) + return cap_issubset(subset->cap_permitted, set->cap_permitted); + + /* The credentials are in a different user namespaces + * therefore one is a subset of the other only if a set is an + * ancestor of subset and set->euid is owner of subset or one + * of subsets ancestors. + */ + for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { + if ((set_ns == subset_ns->parent) && + uid_eq(subset_ns->owner, set->euid)) + return true; + } + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -493,7 +435,7 @@ int commit_creds(struct cred *new) !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || - !cap_issubset(new->cap_permitted, old->cap_permitted)) { + !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; @@ -643,9 +585,6 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif const struct cred *old; struct cred *new; @@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } -#endif - kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = NULL; - new->tgcred = tgcred; - new->request_key_auth = NULL; + new->session_keyring = NULL; + new->process_keyring = NULL; new->thread_keyring = NULL; + new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc8..c26278fd4851 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -29,6 +29,7 @@ */ #include <linux/pid_namespace.h> #include <linux/clocksource.h> +#include <linux/serial_core.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/console.h> diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 3494c28a7e7a..2235967e78b0 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -72,6 +72,8 @@ extern int dbg_kdb_mode; #ifdef CONFIG_KGDB_KDB extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); +extern int kdb_common_init_state(struct kgdb_state *ks); +extern int kdb_common_deinit_state(void); #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..19d9a578c753 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/kgdb.h> #include <linux/kdb.h> +#include <linux/serial_core.h> #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks) len = len / 2; remcom_out_buffer[len++] = 0; + kdb_common_init_state(ks); kdb_parse(remcom_out_buffer); + kdb_common_deinit_state(); + strcpy(remcom_out_buffer, "OK"); } break; diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 8418c2f8ec5d..70a504601dc3 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv) /* * kdb_ss * - * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) - * commands. + * Process the 'ss' (Single Step) command. * * ss - * ssb * * Parameters: * argc Argument count @@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv) * Outputs: * None. * Returns: - * KDB_CMD_SS[B] for success, a kdb error if failure. + * KDB_CMD_SS for success, a kdb error if failure. * Locking: * None. * Remarks: * * Set the arch specific option to trigger a debug trap after the next * instruction. - * - * For 'ssb', set the trace flag in the debug trap handler - * after printing the current insn and return directly without - * invoking the kdb command processor, until a branch instruction - * is encountered. */ static int kdb_ss(int argc, const char **argv) { - int ssb = 0; - - ssb = (strcmp(argv[0], "ssb") == 0); if (argc != 0) return KDB_ARGCOUNT; /* * Set trace flag and go. */ KDB_STATE_SET(DOING_SS); - if (ssb) { - KDB_STATE_SET(DOING_SSB); - return KDB_CMD_SSB; - } return KDB_CMD_SS; } @@ -561,8 +547,6 @@ void __init kdb_initbptab(void) kdb_register_repeat("ss", kdb_ss, "", "Single Step", 1, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("ssb", kdb_ss, "", - "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index be7b33b73d30..328d18ef31e4 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx); static struct kgdb_state *kdb_ks; +int kdb_common_init_state(struct kgdb_state *ks) +{ + kdb_initial_cpu = atomic_read(&kgdb_active); + kdb_current_task = kgdb_info[ks->cpu].task; + kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + return 0; +} + +int kdb_common_deinit_state(void) +{ + kdb_initial_cpu = -1; + kdb_current_task = NULL; + kdb_current_regs = NULL; + return 0; +} + int kdb_stub(struct kgdb_state *ks) { int error = 0; @@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks) } /* Set initial kdb state variables */ KDB_STATE_CLEAR(KGDB_TRANS); - kdb_initial_cpu = atomic_read(&kgdb_active); - kdb_current_task = kgdb_info[ks->cpu].task; - kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + kdb_common_init_state(ks); /* Remove any breakpoints as needed by kdb and clear single step */ kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); - KDB_STATE_CLEAR(DOING_SSB); KDB_STATE_SET(PAGER); /* zero out any offline cpu data */ for_each_present_cpu(i) { @@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks) * Upon exit from the kdb main loop setup break points and restart * the system based on the requested continue state */ - kdb_initial_cpu = -1; - kdb_current_task = NULL; - kdb_current_regs = NULL; + kdb_common_deinit_state(); KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d5f8d5612f3..00eb8f7fbf41 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = { }; #undef KDBMSG -static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); +static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* @@ -175,7 +175,7 @@ static char *__env[] = { (char *)0, }; -static const int __nenv = (sizeof(__env) / sizeof(char *)); +static const int __nenv = ARRAY_SIZE(__env); struct task_struct *kdb_curr_task(int cpu) { @@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv) } if (argc != 3) return KDB_ARGCOUNT; - defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), - GFP_KDB); - if (!defcmd_set) { - kdb_printf("Could not allocate new defcmd_set entry for %s\n", - argv[1]); - defcmd_set = save_defcmd_set; + if (in_dbg_master()) { + kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } + defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), + GFP_KDB); + if (!defcmd_set) + goto fail_defcmd; memcpy(defcmd_set, save_defcmd_set, defcmd_set_count * sizeof(*defcmd_set)); - kfree(save_defcmd_set); s = defcmd_set + defcmd_set_count; memset(s, 0, sizeof(*s)); s->usable = 1; s->name = kdb_strdup(argv[1], GFP_KDB); + if (!s->name) + goto fail_name; s->usage = kdb_strdup(argv[2], GFP_KDB); + if (!s->usage) + goto fail_usage; s->help = kdb_strdup(argv[3], GFP_KDB); + if (!s->help) + goto fail_help; if (s->usage[0] == '"') { - strcpy(s->usage, s->usage+1); + strcpy(s->usage, argv[2]+1); s->usage[strlen(s->usage)-1] = '\0'; } if (s->help[0] == '"') { - strcpy(s->help, s->help+1); + strcpy(s->help, argv[3]+1); s->help[strlen(s->help)-1] = '\0'; } ++defcmd_set_count; defcmd_in_progress = 1; + kfree(save_defcmd_set); return 0; +fail_help: + kfree(s->usage); +fail_usage: + kfree(s->name); +fail_name: + kfree(defcmd_set); +fail_defcmd: + kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; } /* @@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p) * KDB_CMD_GO User typed 'go'. * KDB_CMD_CPU User switched to another cpu. * KDB_CMD_SS Single step. - * KDB_CMD_SSB Single step until branch. */ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_dbtrap_t db_result) @@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", instruction_pointer(regs)); break; - case KDB_DB_SSB: - /* - * In the midst of ssb command. Just return. - */ - KDB_DEBUG_STATE("kdb_local 3", reason); - return KDB_CMD_SSB; /* Continue with SSB command */ - - break; case KDB_DB_SS: break; case KDB_DB_SSBPT: @@ -1281,7 +1288,6 @@ do_full_getstr: if (diag == KDB_CMD_GO || diag == KDB_CMD_CPU || diag == KDB_CMD_SS - || diag == KDB_CMD_SSB || diag == KDB_CMD_KGDB) break; @@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, break; } - if (result == KDB_CMD_SSB) { - KDB_STATE_SET(DOING_SS); - KDB_STATE_SET(DOING_SSB); - break; - } - if (result == KDB_CMD_KGDB) { if (!KDB_STATE(DOING_KGDB)) kdb_printf("Entering please attach debugger " @@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("Module Size modstruct Used by\n"); list_for_each_entry(mod, kdb_modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); @@ -2348,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv) return 0; } -/* - * kdb_ll - This function implements the 'll' command which follows a - * linked list and executes an arbitrary command for each - * element. - */ -static int kdb_ll(int argc, const char **argv) -{ - int diag = 0; - unsigned long addr; - long offset = 0; - unsigned long va; - unsigned long linkoffset; - int nextarg; - const char *command; - - if (argc != 3) - return KDB_ARGCOUNT; - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); - if (diag) - return diag; - - diag = kdbgetularg(argv[2], &linkoffset); - if (diag) - return diag; - - /* - * Using the starting address as - * the first element in the list, and assuming that - * the list ends with a null pointer. - */ - - va = addr; - command = kdb_strdup(argv[3], GFP_KDB); - if (!command) { - kdb_printf("%s: cannot duplicate command\n", __func__); - return 0; - } - /* Recursive use of kdb_parse, do not use argv after this point */ - argv = NULL; - - while (va) { - char buf[80]; - - if (KDB_FLAG(CMD_INTERRUPT)) - goto out; - - sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); - diag = kdb_parse(buf); - if (diag) - goto out; - - addr = va + linkoffset; - if (kdb_getword(&va, addr, sizeof(va))) - goto out; - } - -out: - kfree(command); - return diag; -} - static int kdb_kgdb(int argc, const char **argv) { return KDB_CMD_KGDB; @@ -2428,11 +2367,15 @@ static int kdb_help(int argc, const char **argv) kdb_printf("-----------------------------" "-----------------------------\n"); for_each_kdbcmd(kt, i) { - if (kt->cmd_name) - kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, - kt->cmd_usage, kt->cmd_help); + char *space = ""; if (KDB_FLAG(CMD_INTERRUPT)) return 0; + if (!kt->cmd_name) + continue; + if (strlen(kt->cmd_usage) > 20) + space = "\n "; + kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, + kt->cmd_usage, space, kt->cmd_help); } return 0; } @@ -2737,7 +2680,7 @@ int kdb_register_repeat(char *cmd, (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); kfree(kdb_commands); } - memset(new + kdb_max_commands, 0, + memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0, kdb_command_extend * sizeof(*new)); kdb_commands = new; kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; @@ -2841,15 +2784,13 @@ static void __init kdb_inittab(void) "Stack traceback", 1, KDB_REPEAT_NONE); kdb_register_repeat("btp", kdb_bt, "<pid>", "Display stack for process <pid>", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", - "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); kdb_register_repeat("btc", kdb_bt, "", "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); kdb_register_repeat("btt", kdb_bt, "<vaddr>", "Backtrace process given its struct task address", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>", - "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); kdb_register_repeat("env", kdb_env, "", "Show environment variables", 0, KDB_REPEAT_NONE); kdb_register_repeat("set", kdb_set, "", diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 392ec6a25844..7afd3c8c41d5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -19,7 +19,6 @@ #define KDB_CMD_GO (-1001) #define KDB_CMD_CPU (-1002) #define KDB_CMD_SS (-1003) -#define KDB_CMD_SSB (-1004) #define KDB_CMD_KGDB (-1005) /* Internal debug flags */ @@ -125,8 +124,6 @@ extern int kdb_state; * kdb control */ #define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ #define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ -#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, - * DOING_SS is also set */ #define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint * after one ss, independent of * DOING_SS */ @@ -191,7 +188,6 @@ extern void kdb_bp_remove(void); typedef enum { KDB_DB_BPT, /* Breakpoint */ KDB_DB_SS, /* Single-step trap */ - KDB_DB_SSB, /* Single step to branch */ KDB_DB_SSBPT, /* Single step over breakpoint */ KDB_DB_NOBPT /* Spurious breakpoint */ } kdb_dbtrap_t; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 418b3f7053aa..d473988c1d0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) unsigned long long t2, t3; unsigned long flags; struct timespec ts; + cputime_t utime, stime, stimescaled, utimescaled; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) goto done; tmp = (s64)d->cpu_run_real_total; - cputime_to_timespec(tsk->utime + tsk->stime, &ts); + task_cputime(tsk, &utime, &stime); + cputime_to_timespec(utime + stime, &ts); tmp += timespec_to_ns(&ts); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; tmp = (s64)d->cpu_scaled_run_real_total; - cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + cputime_to_timespec(utimescaled + stimescaled, &ts); tmp += timespec_to_ns(&ts); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; diff --git a/kernel/events/core.c b/kernel/events/core.c index f9ff5493171d..b0cd86501c30 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) } /* + * Initialize event state based on the perf_event_attr::disabled. + */ +static inline void perf_event__state_init(struct perf_event *event) +{ + event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : + PERF_EVENT_STATE_INACTIVE; +} + +/* * Called at perf_event creation and when events are attached/detached from a * group. */ @@ -3682,7 +3691,7 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; @@ -5117,7 +5126,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; - struct hlist_node *node; struct hlist_head *head; rcu_read_lock(); @@ -5125,7 +5133,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (!head) goto end; - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } @@ -5410,7 +5418,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, { struct perf_sample_data data; struct perf_event *event; - struct hlist_node *node; struct perf_raw_record raw = { .size = entry_size, @@ -5420,7 +5427,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr, 0); data.raw = &raw; - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } @@ -5956,13 +5963,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type) pmu->name = name; if (type < 0) { - int err = idr_pre_get(&pmu_idr, GFP_KERNEL); - if (!err) - goto free_pdc; - - err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); - if (err) { - ret = err; + type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); + if (type < 0) { + ret = type; goto free_pdc; } } @@ -6155,18 +6158,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (task) { event->attach_state = PERF_ATTACH_TASK; + + if (attr->type == PERF_TYPE_TRACEPOINT) + event->hw.tp_target = task; #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * hw_breakpoint is a bit difficult here.. */ - if (attr->type == PERF_TYPE_BREAKPOINT) + else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif } @@ -6179,8 +6185,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->overflow_handler = overflow_handler; event->overflow_handler_context = context; - if (attr->disabled) - event->state = PERF_EVENT_STATE_OFF; + perf_event__state_init(event); pmu = NULL; @@ -6609,9 +6614,17 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&gctx->mutex); perf_remove_from_context(group_leader); + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. + */ + perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling); + perf_event__state_init(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fe8a916507ed..a64f8aeb5c1f 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); if (err_cpu == cpu) break; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dea7acfbb071..a567c8c7ef31 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> +#include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ #include <linux/swap.h> /* try_to_free_swap */ @@ -41,58 +42,31 @@ #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; - -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ - -#define UPROBES_HASH_SZ 13 - /* - * We need separate register/unregister and mmap/munmap lock hashes because - * of mmap_sem nesting. - * - * uprobe_register() needs to install probes on (potentially) all processes - * and thus needs to acquire multiple mmap_sems (consequtively, not - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem - * for the particular process doing the mmap. - * - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem - * because of lock order against i_mmap_mutex. This means there's a hole in - * the register vma iteration where a mmap() can happen. - * - * Thus uprobe_register() can race with uprobe_mmap() and we can try and - * install a probe where one is already installed. + * allows us to skip the uprobe_mmap if there are no uprobe events active + * at this time. Probably a fine grained per inode count is better? */ +#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -/* serialize (un)register */ -static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; - -#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) +static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +#define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) static struct percpu_rw_semaphore dup_mmap_sem; -/* - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe - * events active at this time. Probably a fine grained per inode count is - * better? - */ -static atomic_t uprobe_events = ATOMIC_INIT(0); - /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER 1 /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 2 +#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; + struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; - struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ @@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) u = __insert_uprobe(uprobe); spin_unlock(&uprobes_treelock); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - return u; } @@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; + init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - mutex_init(&uprobe->copy_mutex); + /* For now assume that the instruction need not be single-stepped */ + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) kfree(uprobe); uprobe = cur_uprobe; iput(inode); - } else { - atomic_inc(&uprobe_events); } return uprobe; } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) -{ - struct uprobe_consumer *uc; - - if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) - return; - - down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (!uc->filter || uc->filter(uc, current)) - uc->handler(uc, regs); - } - up_read(&uprobe->consumer_rwsem); -} - -/* Returns the previous consumer */ -static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); uc->next = uprobe->consumers; uprobe->consumers = uc; up_write(&uprobe->consumer_rwsem); - - return uc->next; } /* @@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; - mutex_lock(&uprobe->copy_mutex); + /* TODO: move this into _register, until then we abuse this sem. */ + down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; @@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: - mutex_unlock(&uprobe->copy_mutex); + up_write(&uprobe->consumer_rwsem); + + return ret; +} + +static inline bool consumer_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + return !uc->filter || uc->filter(uc, ctx, mm); +} + +static bool filter_chain(struct uprobe *uprobe, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct uprobe_consumer *uc; + bool ret = false; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + ret = consumer_filter(uc, ctx, mm); + if (ret) + break; + } + up_read(&uprobe->consumer_rwsem); return ret; } @@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, bool first_uprobe; int ret; - /* - * If probe is being deleted, unregister thread could be done with - * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. Also we could be from fork or - * mremap path, where the probe might have already been inserted. - * Hence behave as if probe already existed. - */ - if (!uprobe->consumers) - return 0; - ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; @@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - /* can happen if uprobe_register() fails */ - if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return 0; - set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} /* * There could be threads that have already hit the breakpoint. They * will recheck the current insn and restart if find_uprobe() fails. @@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { + if (WARN_ON(!uprobe_is_active(uprobe))) + return; + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); + RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ iput(uprobe->inode); put_uprobe(uprobe); - atomic_dec(&uprobe_events); } struct map_info { @@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) return curr; } -static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +static int +register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { + bool is_register = !!new; struct map_info *info; int err = 0; @@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - else - err |= remove_breakpoint(uprobe, mm, info->vaddr); + if (is_register) { + /* consult only the "caller", new consumer. */ + if (consumer_filter(new, + UPROBE_FILTER_REGISTER, mm)) + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + if (!filter_chain(uprobe, + UPROBE_FILTER_UNREGISTER, mm)) + err |= remove_breakpoint(uprobe, mm, info->vaddr); + } unlock: up_write(&mm->mmap_sem); @@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) return err; } -static int __uprobe_register(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - return register_for_each_vma(uprobe, true); + consumer_add(uprobe, uc); + return register_for_each_vma(uprobe, uc); } -static void __uprobe_unregister(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - if (!register_for_each_vma(uprobe, false)) - delete_uprobe(uprobe); + int err; + + if (!consumer_del(uprobe, uc)) /* WARN? */ + return; + err = register_for_each_vma(uprobe, NULL); /* TODO : cant unregister? schedule a worker thread */ + if (!uprobe->consumers && !err) + delete_uprobe(uprobe); } /* @@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; - if (!inode || !uc || uc->next) - return -EINVAL; - + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; - ret = 0; - mutex_lock(uprobes_hash(inode)); + retry: uprobe = alloc_uprobe(inode, offset); - - if (!uprobe) { - ret = -ENOMEM; - } else if (!consumer_add(uprobe, uc)) { - ret = __uprobe_register(uprobe); - if (ret) { - uprobe->consumers = NULL; - __uprobe_unregister(uprobe); - } else { - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } + if (!uprobe) + return -ENOMEM; + /* + * We can race with uprobe_unregister()->delete_uprobe(). + * Check uprobe_is_active() and retry if it is false. + */ + down_write(&uprobe->register_rwsem); + ret = -EAGAIN; + if (likely(uprobe_is_active(uprobe))) { + ret = __uprobe_register(uprobe, uc); + if (ret) + __uprobe_unregister(uprobe, uc); } + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + if (unlikely(ret == -EAGAIN)) + goto retry; + return ret; +} +EXPORT_SYMBOL_GPL(uprobe_register); + +/* + * uprobe_apply - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: consumer which wants to add more or remove some breakpoints + * @add: add or remove the breakpoints + */ +int uprobe_apply(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc, bool add) +{ + struct uprobe *uprobe; + struct uprobe_consumer *con; + int ret = -ENOENT; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return ret; + + down_write(&uprobe->register_rwsem); + for (con = uprobe->consumers; con && con != uc ; con = con->next) + ; + if (con) + ret = register_for_each_vma(uprobe, add ? uc : NULL); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); return ret; } @@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume { struct uprobe *uprobe; - if (!inode || !uc) - return; - uprobe = find_uprobe(inode, offset); if (!uprobe) return; - mutex_lock(uprobes_hash(inode)); + down_write(&uprobe->register_rwsem); + __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister); - if (consumer_del(uprobe, uc)) { - if (!uprobe->consumers) { - __uprobe_unregister(uprobe); - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } +static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int err = 0; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long vaddr; + loff_t offset; + + if (!valid_vma(vma, false) || + vma->vm_file->f_mapping->host != uprobe->inode) + continue; + + offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + if (uprobe->offset < offset || + uprobe->offset >= offset + vma->vm_end - vma->vm_start) + continue; + + vaddr = offset_to_vaddr(vma, uprobe->offset); + err |= remove_breakpoint(uprobe, mm, vaddr); } + up_read(&mm->mmap_sem); - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + return err; } static struct rb_node * @@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + if (no_uprobe_events() || !valid_vma(vma, true)) return 0; inode = vma->vm_file->f_mapping->host; @@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - + /* + * We can race with uprobe_unregister(), this uprobe can be already + * removed. But in this case filter_chain() must return false, all + * consumers have gone away. + */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } @@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ @@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct xol_area *area) { - struct mm_struct *mm; - int ret; - - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) - return -ENOMEM; - - ret = -EALREADY; - mm = current->mm; + struct mm_struct *mm = current->mm; + int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (area->vaddr & ~PAGE_MASK) { @@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; ret = 0; - -fail: + fail: up_write(&mm->mmap_sem); - if (ret) - __free_page(area->page); return ret; } -static struct xol_area *get_xol_area(struct mm_struct *mm) -{ - struct xol_area *area; - - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ - - return area; -} - /* - * xol_alloc_area - Allocate process's xol_area. - * This area will be used for storing instructions for execution out of - * line. + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ -static struct xol_area *xol_alloc_area(void) +static struct xol_area *get_xol_area(void) { + struct mm_struct *mm = current->mm; struct xol_area *area; + area = mm->uprobes_state.xol_area; + if (area) + goto ret; + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) - return NULL; + goto out; area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); - if (!area->bitmap) - goto fail; + goto free_area; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + goto free_bitmap; init_waitqueue_head(&area->wq); if (!xol_add_vma(area)) return area; -fail: + __free_page(area->page); + free_bitmap: kfree(area->bitmap); + free_area: kfree(area); - - return get_xol_area(current->mm); + out: + area = mm->uprobes_state.xol_area; + ret: + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + return area; } /* @@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) } /* - * xol_get_insn_slot - If was not allocated a slot, then - * allocate a slot. + * xol_get_insn_slot - allocate a slot for xol. * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; unsigned long offset; + unsigned long xol_vaddr; void *vaddr; - area = get_xol_area(current->mm); - if (!area) { - area = xol_alloc_area(); - if (!area) - return 0; - } - current->utask->xol_vaddr = xol_take_insn_slot(area); + area = get_xol_area(); + if (!area) + return 0; - /* - * Initialize the slot if xol_vaddr points to valid - * instruction slot. - */ - if (unlikely(!current->utask->xol_vaddr)) + xol_vaddr = xol_take_insn_slot(area); + if (unlikely(!xol_vaddr)) return 0; - current->utask->vaddr = slot_addr; - offset = current->utask->xol_vaddr & ~PAGE_MASK; + /* Initialize the slot */ + offset = xol_vaddr & ~PAGE_MASK; vaddr = kmap_atomic(area->page); memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); kunmap_atomic(vaddr); @@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot */ flush_dcache_page(area->page); - return current->utask->xol_vaddr; + return xol_vaddr; } /* @@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) return; slot_addr = tsk->utask->xol_vaddr; - - if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + if (unlikely(!slot_addr)) return; area = tsk->mm->uprobes_state.xol_area; @@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task. - * Called when the thread hits a breakpoint for the first time. + * Allocate a uprobe_task object for the task if if necessary. + * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ -static struct uprobe_task *add_utask(void) +static struct uprobe_task *get_utask(void) { - struct uprobe_task *utask; - - utask = kzalloc(sizeof *utask, GFP_KERNEL); - if (unlikely(!utask)) - return NULL; - - current->utask = utask; - return utask; + if (!current->utask) + current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + return current->utask; } /* Prepare to single-step probed instruction out of line. */ static int -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) - return 0; + struct uprobe_task *utask; + unsigned long xol_vaddr; + int err; + + utask = get_utask(); + if (!utask) + return -ENOMEM; + + xol_vaddr = xol_get_insn_slot(uprobe); + if (!xol_vaddr) + return -ENOMEM; + + utask->xol_vaddr = xol_vaddr; + utask->vaddr = bp_vaddr; + + err = arch_uprobe_pre_xol(&uprobe->arch, regs); + if (unlikely(err)) { + xol_free_insn_slot(current); + return err; + } - return -EFAULT; + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; + return 0; } /* @@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. + * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; @@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + int remove = UPROBE_HANDLER_REMOVE; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + int rc = uc->handler(uc, regs); + + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + remove &= rc; + } + + if (remove && uprobe->consumers) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + up_read(&uprobe->register_rwsem); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { - struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; int uninitialized_var(is_swbp); @@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs) } return; } + + /* change it in advance for ->handler() and restart */ + instruction_pointer_set(regs, bp_vaddr); + /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the @@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs) */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) - goto restart; - - utask = current->utask; - if (!utask) { - utask = add_utask(); - /* Cannot allocate; re-execute the instruction. */ - if (!utask) - goto restart; - } + goto out; handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) { - utask->active_uprobe = uprobe; - utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - } -restart: - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); + /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } @@ -1609,10 +1649,8 @@ static int __init init_uprobes(void) { int i; - for (i = 0; i < UPROBES_HASH_SZ; i++) { - mutex_init(&uprobes_mutex[i]); + for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - } if (percpu_init_rwsem(&dup_mmap_sem)) return -ENOMEM; diff --git a/kernel/exit.c b/kernel/exit.c index 50d2e93c36ea..51e485ca9935 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -20,6 +20,7 @@ #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> @@ -31,7 +32,6 @@ #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> -#include <linux/freezer.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). - */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } @@ -97,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk) bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); + cputime_t utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); @@ -135,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; - sig->gtime += tsk->gtime; + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -495,7 +485,7 @@ static void exit_mm(struct task_struct * tsk) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; - schedule(); + freezable_schedule(); } __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); @@ -845,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done @@ -1104,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) sig = p->signal; psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; - psig->cgtime += p->gtime + sig->gtime + sig->cgtime; + psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += diff --git a/kernel/fork.c b/kernel/fork.c index 3c31e874afad..8d932b1c9056 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); @@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; +#endif if (!mm_init(mm, tsk)) goto fail_nomem; @@ -1041,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1165,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); + /* + * If the new process will be in a different pid namespace + * don't allow the creation of threads. + */ + if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && + (task_active_pid_ns(current) != current->nsproxy->pid_ns)) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1224,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqlock_init(&p->vtime_seqlock); + p->vtime_snap = 0; + p->vtime_snap_whence = VTIME_SLEEPING; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif @@ -1435,8 +1450,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + if (is_child_reaper(pid)) { + ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); @@ -1470,8 +1487,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) @@ -1551,15 +1566,9 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; } /* @@ -1618,7 +1627,6 @@ long do_fork(unsigned long clone_flags, return nr; } -#ifdef CONFIG_GENERIC_KERNEL_THREAD /* * Create a kernel thread. */ @@ -1627,7 +1635,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } -#endif #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) @@ -1667,8 +1674,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, 0, - parent_tidptr, child_tidptr); + long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + asmlinkage_protect(5, ret, clone_flags, newsp, + parent_tidptr, child_tidptr, tls_val); + return ret; } #endif @@ -1721,7 +1730,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1788,19 +1798,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1814,11 +1845,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1826,10 +1861,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) exit_sem(current); } - if (new_nsproxy) { + if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); - new_nsproxy = NULL; - } task_lock(current); @@ -1851,11 +1884,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); - } - if (new_nsproxy) - put_nsproxy(new_nsproxy); + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } + } +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca003..f0090a993dab 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,7 @@ #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> +#include <linux/sched/rt.h> #include <asm/futex.h> @@ -225,7 +226,7 @@ static void drop_futex_key_refs(union futex_key *key) * Returns a negative error code or 0 * The key words are stored in *key on success. * - * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, + * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * @@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..f9f44fd4d34d 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -11,6 +11,7 @@ #include <linux/nsproxy.h> #include <linux/futex.h> #include <linux/ptrace.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> @@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr) } } -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, return 0; } -asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, - compat_size_t __user *len_ptr) +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; @@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; @@ -172,9 +171,9 @@ err_unlock: return ret; } -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - u32 val3) +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct compat_timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..d4da55d1fb65 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b5..cc47812d3feb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,8 @@ #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> #include <linux/timer.h> #include <asm/uaccess.h> @@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) * and expiry check is done in the hrtimer_interrupt or in the softirq. */ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { - if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - if (wakeup) { - raw_spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - raw_spin_lock(&base->cpu_base->lock); - } else - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return 1; - } - - return 0; + return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) @@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { return 0; } @@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * * XXX send_remote_softirq() ? */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base, wakeup); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) + && hrtimer_enqueue_reprogram(timer, new_base)) { + if (wakeup) { + /* + * We need to drop cpu_base->lock to avoid a + * lock ordering issue vs. rq->lock. + */ + raw_spin_unlock(&new_base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + local_irq_restore(flags); + return ret; + } else { + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + } unlock_hrtimer_base(timer, &flags); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3aca9f29d30e..cbd97ce0b000 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data) EXPORT_SYMBOL(irq_set_handler_data); /** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + * @irq_base: Interrupt number base + * @irq_offset: Interrupt number offset + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq + * Set the MSI descriptor entry for an irq at offset */ -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, + struct msi_desc *entry) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->irq_data.msi_desc = entry; - if (entry) - entry->irq = irq; + if (entry && !irq_offset) + entry->irq = irq_base; irq_put_desc_unlock(desc, flags); return 0; } /** + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data + * + * Set the MSI descriptor entry for an irq + */ +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +{ + return irq_set_msi_desc_off(irq, 0, entry); +} + +/** * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 35c70c9e24d8..fa17855ca65a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/task_work.h> #include "internals.h" @@ -818,7 +819,7 @@ static void irq_thread_dtor(struct callback_head *unused) action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); @@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) out: irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(enable_percpu_irq); void disable_percpu_irq(unsigned int irq) { @@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq) irq_percpu_disable(desc, cpu); irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(disable_percpu_irq); /* * Internal function to unregister a percpu irqaction. diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4bd4faa6323a..397db02209ed 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; + unsigned int irq = (int)(long)PDE(file_inode(file))->data; cpumask_var_t new_value; int err; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 611cd6003c45..7b5f012bde9d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) /* * All handlers must agree on IRQF_SHARED, so we test just the - * first. Check for action->next as well. + * first. */ action = desc->action; if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || - (action->handler(irq, action->dev_id) == IRQ_HANDLED) || - !action->next) + (action->flags & __IRQF_TIMER)) goto out; /* Already running on another processor */ @@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) do { if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; + /* Make sure that there is still a valid action */ action = desc->action; } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b2871b..55fcce6065cf 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -12,37 +12,36 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/irqflags.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/processor.h> -/* - * An entry can be in one of four states: - * - * free NULL, 0 -> {claimed} : free to be used - * claimed NULL, 3 -> {pending} : claimed to be enqueued - * pending next, 3 -> {busy} : queued, pending callback - * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed - */ - -#define IRQ_WORK_PENDING 1UL -#define IRQ_WORK_BUSY 2UL -#define IRQ_WORK_FLAGS 3UL static DEFINE_PER_CPU(struct llist_head, irq_work_list); +static DEFINE_PER_CPU(int, irq_work_raised); /* * Claim the entry so that no one else will poke at it. */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, nflags; + unsigned long flags, oflags, nflags; + /* + * Start with our best wish as a premise but only trust any + * flag value after cmpxchg() result. + */ + flags = work->flags & ~IRQ_WORK_PENDING; for (;;) { - flags = work->flags; - if (flags & IRQ_WORK_PENDING) - return false; nflags = flags | IRQ_WORK_FLAGS; - if (cmpxchg(&work->flags, flags, nflags) == flags) + oflags = cmpxchg(&work->flags, flags, nflags); + if (oflags == flags) break; + if (oflags & IRQ_WORK_PENDING) + return false; + flags = oflags; cpu_relax(); } @@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void) } /* - * Queue the entry and raise the IPI if needed. + * Enqueue the irq_work @entry unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. */ -static void __irq_work_queue(struct irq_work *work) +void irq_work_queue(struct irq_work *work) { - bool empty; + /* Only queue if not already pending */ + if (!irq_work_claim(work)) + return; + /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - /* The list was empty, raise self-interrupt to start processing. */ - if (empty) - arch_irq_work_raise(); + llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); + + /* + * If the work is not "lazy" or the tick is stopped, raise the irq + * work interrupt (if supported by the arch), otherwise, just wait + * for the next tick. + */ + if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { + if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) + arch_irq_work_raise(); + } preempt_enable(); } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* - * Enqueue the irq_work @entry, returns true on success, failure when the - * @entry was already enqueued by someone else. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue(struct irq_work *work) +bool irq_work_needs_cpu(void) { - if (!irq_work_claim(work)) { - /* - * Already enqueued, can't do! - */ + struct llist_head *this_list; + + this_list = &__get_cpu_var(irq_work_list); + if (llist_empty(this_list)) return false; - } - __irq_work_queue(work); + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + return true; } -EXPORT_SYMBOL_GPL(irq_work_queue); -/* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. - */ -void irq_work_run(void) +static void __irq_work_run(void) { + unsigned long flags; struct irq_work *work; struct llist_head *this_list; struct llist_node *llnode; + + /* + * Reset the "raised" state right before we check the list because + * an NMI may enqueue after we find the list empty from the runner. + */ + __this_cpu_write(irq_work_raised, 0); + barrier(); + this_list = &__get_cpu_var(irq_work_list); if (llist_empty(this_list)) return; - BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); llnode = llist_del_all(this_list); @@ -119,16 +130,31 @@ void irq_work_run(void) /* * Clear the PENDING bit, after this point the @work * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. */ - work->flags = IRQ_WORK_BUSY; + flags = work->flags & ~IRQ_WORK_PENDING; + xchg(&work->flags, flags); + work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ - (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); + (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + BUG_ON(!in_irq()); + __irq_work_run(); +} EXPORT_SYMBOL_GPL(irq_work_run); /* @@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +#ifdef CONFIG_HOTPLUG_CPU +static int irq_work_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_DYING: + /* Called from stop_machine */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + break; + __irq_work_run(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_notify; + +static __init int irq_work_init_cpu_notifier(void) +{ + cpu_notify.notifier_call = irq_work_cpu_notify; + cpu_notify.priority = 0; + register_cpu_notifier(&cpu_notify); + return 0; +} +device_initcall(irq_work_init_cpu_notifier); + +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 30b7b225306c..e30ac0fe61c3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -4,6 +4,7 @@ #include <linux/string.h> #include <linux/random.h> #include <linux/module.h> +#include <linux/ptrace.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/cache.h> diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..bddd3d7a74b6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = { .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource crashk_low_res = { + .name = "Crash kernel low", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; int kexec_should_crash(struct task_struct *p) { @@ -223,6 +229,8 @@ out: } +static void kimage_free_page_list(struct list_head *list); + static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments) @@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, if (result) goto out; - *rimage = image; - /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be @@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; + goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { printk(KERN_ERR "Could not allocate swap buffer\n"); - goto out; + goto out_free; } - result = 0; - out: - if (result == 0) - *rimage = image; - else - kfree(image); + *rimage = image; + return 0; +out_free: + kimage_free_page_list(&image->control_pages); + kfree(image); +out: return result; } @@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out; + goto out_free; } /* @@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; + goto out_free; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + *rimage = image; + return 0; +out_free: + kfree(image); +out: return result; } @@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; - if (hole_end > crashk_res.end) - break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; @@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline, * That function is the entry point for command line parsing and should be * called from the arch-specific code. */ -int __init parse_crashkernel(char *cmdline, +static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + const char *name) { char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; @@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline, *crash_base = 0; /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); + p = strstr(p, name); while (p) { ck_cmdline = p; - p = strstr(p+1, "crashkernel="); + p = strstr(p+1, name); } if (!ck_cmdline) return -EINVAL; - ck_cmdline += 12; /* strlen("crashkernel=") */ + ck_cmdline += strlen(name); /* * if the commandline contains a ':', then that's the extended @@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline, return 0; } +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel="); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_low="); +} static void update_vmcoreinfo_note(void) { @@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, _count); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); diff --git a/kernel/kfifo.c b/kernel/kfifo.c deleted file mode 100644 index 59dcf5b81d24..000000000000 --- a/kernel/kfifo.c +++ /dev/null @@ -1,609 +0,0 @@ -/* - * A generic kernel FIFO implementation - * - * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/log2.h> -#include <linux/uaccess.h> -#include <linux/kfifo.h> - -/* - * internal helper to calculate the unused elements in a fifo - */ -static inline unsigned int kfifo_unused(struct __kfifo *fifo) -{ - return (fifo->mask + 1) - (fifo->in - fifo->out); -} - -int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask) -{ - /* - * round down to the next power of 2, since our 'let the indices - * wrap' technique works only in this case. - */ - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - - if (size < 2) { - fifo->data = NULL; - fifo->mask = 0; - return -EINVAL; - } - - fifo->data = kmalloc(size * esize, gfp_mask); - - if (!fifo->data) { - fifo->mask = 0; - return -ENOMEM; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_alloc); - -void __kfifo_free(struct __kfifo *fifo) -{ - kfree(fifo->data); - fifo->in = 0; - fifo->out = 0; - fifo->esize = 0; - fifo->data = NULL; - fifo->mask = 0; -} -EXPORT_SYMBOL(__kfifo_free); - -int __kfifo_init(struct __kfifo *fifo, void *buffer, - unsigned int size, size_t esize) -{ - size /= esize; - - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - fifo->data = buffer; - - if (size < 2) { - fifo->mask = 0; - return -EINVAL; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_init); - -static void kfifo_copy_in(struct __kfifo *fifo, const void *src, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(fifo->data + off, src, l); - memcpy(fifo->data, src + l, len - l); - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_in(struct __kfifo *fifo, - const void *buf, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - kfifo_copy_in(fifo, buf, len, fifo->in); - fifo->in += len; - return len; -} -EXPORT_SYMBOL(__kfifo_in); - -static void kfifo_copy_out(struct __kfifo *fifo, void *dst, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(dst, fifo->data + off, l); - memcpy(dst + l, fifo->data, len - l); - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_out_peek(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - kfifo_copy_out(fifo, buf, len, fifo->out); - return len; -} -EXPORT_SYMBOL(__kfifo_out_peek); - -unsigned int __kfifo_out(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - len = __kfifo_out_peek(fifo, buf, len); - fifo->out += len; - return len; -} -EXPORT_SYMBOL(__kfifo_out); - -static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned int *copied) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned long ret; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_from_user(fifo->data + off, from, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_from_user(fifo->data, from + l, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->in += len; - return err; -} -EXPORT_SYMBOL(__kfifo_from_user); - -static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, - unsigned int len, unsigned int off, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_to_user(to, fifo->data + off, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_to_user(to + l, fifo->data, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_to_user(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->out += len; - return err; -} -EXPORT_SYMBOL(__kfifo_to_user); - -static int setup_sgl_buf(struct scatterlist *sgl, void *buf, - int nents, unsigned int len) -{ - int n; - unsigned int l; - unsigned int off; - struct page *page; - - if (!nents) - return 0; - - if (!len) - return 0; - - n = 0; - page = virt_to_page(buf); - off = offset_in_page(buf); - l = 0; - - while (len >= l + PAGE_SIZE - off) { - struct page *npage; - - l += PAGE_SIZE; - buf += PAGE_SIZE; - npage = virt_to_page(buf); - if (page_to_phys(page) != page_to_phys(npage) - l) { - sg_set_page(sgl, page, l - off, off); - sgl = sg_next(sgl); - if (++n == nents || sgl == NULL) - return n; - page = npage; - len -= l - off; - l = off = 0; - } - } - sg_set_page(sgl, page, len, off); - return n + 1; -} - -static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, - int nents, unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned int n; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - n = setup_sgl_buf(sgl, fifo->data + off, nents, l); - n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); - - return n; -} - -unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->in); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare); - -unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->out); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare); - -unsigned int __kfifo_max_r(unsigned int len, size_t recsize) -{ - unsigned int max = (1 << (recsize << 3)) - 1; - - if (len > max) - return max; - return len; -} -EXPORT_SYMBOL(__kfifo_max_r); - -#define __KFIFO_PEEK(data, out, mask) \ - ((data)[(out) & (mask)]) -/* - * __kfifo_peek_n internal helper function for determinate the length of - * the next record in the fifo - */ -static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) -{ - unsigned int l; - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - l = __KFIFO_PEEK(data, fifo->out, mask); - - if (--recsize) - l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; - - return l; -} - -#define __KFIFO_POKE(data, in, mask, val) \ - ( \ - (data)[(in) & (mask)] = (unsigned char)(val) \ - ) - -/* - * __kfifo_poke_n internal helper function for storeing the length of - * the record into the fifo - */ -static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) -{ - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - __KFIFO_POKE(data, fifo->in, mask, n); - - if (recsize > 1) - __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); -} - -unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) -{ - return __kfifo_peek_n(fifo, recsize); -} -EXPORT_SYMBOL(__kfifo_len_r); - -unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, - unsigned int len, size_t recsize) -{ - if (len + recsize > kfifo_unused(fifo)) - return 0; - - __kfifo_poke_n(fifo, len, recsize); - - kfifo_copy_in(fifo, buf, len, fifo->in + recsize); - fifo->in += len + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_in_r); - -static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, - void *buf, unsigned int len, size_t recsize, unsigned int *n) -{ - *n = __kfifo_peek_n(fifo, recsize); - - if (len > *n) - len = *n; - - kfifo_copy_out(fifo, buf, len, fifo->out + recsize); - return len; -} - -unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - return kfifo_out_copy_r(fifo, buf, len, recsize, &n); -} -EXPORT_SYMBOL(__kfifo_out_peek_r); - -unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); - fifo->out += n + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_out_r); - -void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int n; - - n = __kfifo_peek_n(fifo, recsize); - fifo->out += n + recsize; -} -EXPORT_SYMBOL(__kfifo_skip_r); - -int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) { - *copied = 0; - return 0; - } - - __kfifo_poke_n(fifo, len, recsize); - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->in += len + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_from_user_r); - -int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - unsigned int n; - - if (fifo->in == fifo->out) { - *copied = 0; - return 0; - } - - n = __kfifo_peek_n(fifo, recsize); - if (len > n) - len = n; - - ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->out += n + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_to_user_r); - -unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); - -void __kfifo_dma_in_finish_r(struct __kfifo *fifo, - unsigned int len, size_t recsize) -{ - len = __kfifo_max_r(len, recsize); - __kfifo_poke_n(fifo, len, recsize); - fifo->in += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_in_finish_r); - -unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > fifo->in - fifo->out) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); - -void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int len; - - len = __kfifo_peek_n(fifo, recsize); - fifo->out += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_out_finish_r); diff --git a/kernel/kmod.c b/kernel/kmod.c index 1c317e386831..56dd34976d7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -38,6 +38,7 @@ #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> +#include <linux/async.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* + * We don't allow synchronous module loading from async. Module + * init may invoke async_synchronize_full() which will end up + * waiting for this task which already is waiting for the module + * loading to complete, leading to a deadlock. + */ + WARN_ON_ONCE(wait && current_is_async()); + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -219,9 +228,9 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = kernel_execve(sub_info->path, - (const char *const *)sub_info->argv, - (const char *const *)sub_info->envp); + retval = do_execve(sub_info->path, + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); if (!retval) return 0; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa409..e35be53f6613 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void) struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (p->addr == addr) return p; } @@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); -static DECLARE_COMPLETION(optimizer_comp); #define OPTIMIZE_DELAY 5 /* @@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) /* Start optimizer after OPTIMIZE_DELAY passed */ static __kprobes void kick_kprobe_optimizer(void) { - if (!delayed_work_pending(&optimizing_work)) - schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ @@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); - else - /* Wake up all waiters */ - complete_all(&optimizer_comp); } /* Wait for completing optimization and unoptimization */ static __kprobes void wait_for_kprobe_optimizer(void) { - if (delayed_work_pending(&optimizing_work)) - wait_for_completion(&optimizer_comp); + mutex_lock(&kprobe_mutex); + + while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { + mutex_unlock(&kprobe_mutex); + + /* this will also make optimizing_work execute immmediately */ + flush_delayed_work(&optimizing_work); + /* @optimizing_work might not have been queued yet, relax */ + cpu_relax(); + + mutex_lock(&kprobe_mutex); + } + + mutex_unlock(&kprobe_mutex); } /* Optimize kprobe if p is ready to be optimized */ @@ -792,7 +798,6 @@ out: static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void) kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } @@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void) static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void) kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (!kprobe_disabled(p)) unoptimize_kprobe(p, false); } @@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, @@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); } -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) do {} while (0) #define disarm_kprobe_ftrace(p) do {} while (0) @@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; unsigned long hash, flags = 0; if (unlikely(!kprobes_initialized)) @@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + struct hlist_node *next; - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { + hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + struct hlist_node *next; struct hlist_head *head; /* No race here */ for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, pos, next, head, hlist) { + hlist_for_each_entry_safe(ri, next, head, hlist) { if (ri->rp == rp) ri->rp = NULL; } @@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; #endif } @@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, { struct module *mod = data; struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; int checkcore = (val == MODULE_STATE_GOING); @@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; @@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) head = &kprobe_table[i]; preempt_disable(); - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (kprobe_aggrprobe(p)) { @@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = { static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (!kprobe_disabled(p)) arm_kprobe(p); } @@ -2258,7 +2259,6 @@ already_enabled: static void __kprobes disarm_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) disarm_kprobe(p, false); } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b2350d..259db207b5d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); + printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + curr->lockdep_depth, MAX_LOCK_DEPTH); printk("turning off the locking correctness validator.\n"); + + lockdep_print_held_locks(current); + debug_show_all_locks(); dump_stack(); + return 0; } @@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) @@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 0; if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); return 1; } @@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: lockdep_init_map(lock, name, key, 0); @@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: if (hlock->instance == lock) @@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void) { if (!debug_locks_off()) return; @@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! ]\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); print_kernel_ident(); printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - + lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void) { - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/kernel/modsign_certificate.S @@ -0,0 +1,19 @@ +/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ +#ifndef SYMBOL_PREFIX +#define ASM_SYMBOL(sym) sym +#else +#define PASTE2(x,y) x##y +#define PASTE(x,y) PASTE2(x,y) +#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) +#endif + +#define GLOBAL(name) \ + .globl ASM_SYMBOL(name); \ + ASM_SYMBOL(name): + + .section ".init.data","aw" + +GLOBAL(modsign_certificate_list) + .incbin "signing_key.x509" + .incbin "extra_certificates" +GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 767e559dfb10..2b6e69909c39 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -20,12 +20,6 @@ struct key *modsign_keyring; extern __initdata const u8 modsign_certificate_list[]; extern __initdata const u8 modsign_certificate_list_end[]; -asm(".section .init.data,\"aw\"\n" - SYMBOL_PREFIX "modsign_certificate_list:\n" - ".incbin \"signing_key.x509\"\n" - ".incbin \"extra_certificates\"\n" - SYMBOL_PREFIX "modsign_certificate_list_end:" - ); /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice @@ -40,18 +34,15 @@ static __init int module_verify_init(void) { pr_notice("Initialise module verification\n"); - modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + modsign_keyring = keyring_alloc(".module_sign", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(modsign_keyring)) panic("Can't allocate module signing keyring\n"); - if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) - panic("Can't instantiate module signing keyring\n"); - return 0; } diff --git a/kernel/module.c b/kernel/module.c index 6e48c3a43599..0925c9a71975 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -21,6 +21,7 @@ #include <linux/ftrace_event.h> #include <linux/init.h> #include <linux/kallsyms.h> +#include <linux/file.h> #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernel.h> @@ -28,6 +29,7 @@ #include <linux/vmalloc.h> #include <linux/elf.h> #include <linux/proc_fs.h> +#include <linux/security.h> #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/fcntl.h> @@ -59,6 +61,7 @@ #include <linux/pfn.h> #include <linux/bsearch.h> #include <linux/fips.h> +#include <uapi/linux/module.h> #include "module-internal.h" #define CREATE_TRACE_POINTS @@ -185,6 +188,7 @@ struct load_info { ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) { + BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); if (mod && mod->state == MODULE_STATE_COMING) return -EBUSY; if (try_module_get(mod)) @@ -193,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod) return -ENOENT; } -static inline void add_taint_module(struct module *mod, unsigned flag) +static inline void add_taint_module(struct module *mod, unsigned flag, + enum lockdep_ok lockdep_ok) { - add_taint(flag); + add_taint(flag, lockdep_ok); mod->taints |= (1U << flag); } @@ -340,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, #endif }; + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) return true; } @@ -372,9 +380,6 @@ static bool check_symbol(const struct symsearch *syms, printk(KERN_WARNING "Symbol %s is being used " "by a non-GPL module, which will not " "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); } } @@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -struct module *find_module(const char *name) +static struct module *find_module_all(const char *name, + bool even_unformed) { struct module *mod; list_for_each_entry(mod, &modules, list) { + if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) + continue; if (strcmp(mod->name, name) == 0) return mod; } return NULL; } + +struct module *find_module(const char *name) +{ + return find_module_all(name, false); +} EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (!mod->percpu_size) continue; for_each_possible_cpu(cpu) { @@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_RMMOD); + add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); return ret; } #else @@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, case MODULE_STATE_GOING: state = "going"; break; + default: + BUG(); } return sprintf(buffer, "%s\n", state); } @@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason) if (!test_taint(TAINT_FORCED_MODULE)) printk(KERN_WARNING "%s: %s: kernel tainted.\n", mod->name, reason); - add_taint_module(mod, TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else return -ENOEXEC; @@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license) if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); } } @@ -2282,7 +2304,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size; + unsigned int i, nsrc, ndst, strtab_size = 0; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2293,9 +2315,6 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* strtab always starts with a nul, so offset 0 is the empty string. */ - strtab_size = 1; - /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || @@ -2337,7 +2356,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *s++ = 0; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { @@ -2378,7 +2396,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return size == 0 ? NULL : vmalloc_exec(size); + return vmalloc_exec(size); } static void *module_alloc_update_bounds(unsigned long size) @@ -2425,18 +2443,17 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, - const void *mod, unsigned long *_len) +static int module_sig_check(struct load_info *info) { int err = -ENOKEY; - unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - unsigned long len = *_len; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const void *mod = info->hdr; - if (len > markerlen && - memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + if (info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ - *_len -= markerlen; - err = mod_verify_sig(mod, _len); + info->len -= markerlen; + err = mod_verify_sig(mod, &info->len); } if (!err) { @@ -2454,59 +2471,114 @@ static int module_sig_check(struct load_info *info, return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, - void *mod, unsigned long *len) +static int module_sig_check(struct load_info *info) { return 0; } #endif /* !CONFIG_MODULE_SIG */ -/* Sets info->hdr, info->len and info->sig_ok. */ -static int copy_and_check(struct load_info *info, - const void __user *umod, unsigned long len, - const char __user *uargs) +/* Sanity checks against invalid binaries, wrong arch, weird elf version. */ +static int elf_header_check(struct load_info *info) +{ + if (info->len < sizeof(*(info->hdr))) + return -ENOEXEC; + + if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 + || info->hdr->e_type != ET_REL + || !elf_check_arch(info->hdr) + || info->hdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (info->hdr->e_shoff >= info->len + || (info->hdr->e_shnum * sizeof(Elf_Shdr) > + info->len - info->hdr->e_shoff)) + return -ENOEXEC; + + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) { int err; - Elf_Ehdr *hdr; - if (len < sizeof(*hdr)) + info->len = len; + if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; + err = security_kernel_module_from_file(NULL); + if (err) + return err; + /* Suck in entire file: we'll want most of it. */ - if ((hdr = vmalloc(len)) == NULL) + info->hdr = vmalloc(info->len); + if (!info->hdr) return -ENOMEM; - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; + if (copy_from_user(info->hdr, umod, info->len) != 0) { + vfree(info->hdr); + return -EFAULT; } - err = module_sig_check(info, hdr, &len); + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_fd(int fd, struct load_info *info) +{ + struct file *file; + int err; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + file = fget(fd); + if (!file) + return -ENOEXEC; + + err = security_kernel_module_from_file(file); if (err) - goto free_hdr; + goto out; + + err = vfs_getattr(&file->f_path, &stat); + if (err) + goto out; + + if (stat.size > INT_MAX) { + err = -EFBIG; + goto out; + } - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + err = -EINVAL; + goto out; } - if (hdr->e_shoff >= len || - hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { - err = -ENOEXEC; - goto free_hdr; + info->hdr = vmalloc(stat.size); + if (!info->hdr) { + err = -ENOMEM; + goto out; } - info->hdr = hdr; - info->len = len; - return 0; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(info->hdr); + err = bytes; + goto out; + } + if (bytes == 0) + break; + pos += bytes; + } + info->len = pos; -free_hdr: - vfree(hdr); +out: + fput(file); return err; } @@ -2515,7 +2587,7 @@ static void free_copy(struct load_info *info) vfree(info->hdr); } -static int rewrite_section_headers(struct load_info *info) +static int rewrite_section_headers(struct load_info *info, int flags) { unsigned int i; @@ -2543,7 +2615,10 @@ static int rewrite_section_headers(struct load_info *info) } /* Track but don't keep modinfo and version sections. */ - info->index.vers = find_sec(info, "__versions"); + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.info = find_sec(info, ".modinfo"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -2558,7 +2633,7 @@ static int rewrite_section_headers(struct load_info *info) * Return the temporary module pointer (we'll replace it with the final * one when we move the module sections around). */ -static struct module *setup_load_info(struct load_info *info) +static struct module *setup_load_info(struct load_info *info, int flags) { unsigned int i; int err; @@ -2569,7 +2644,7 @@ static struct module *setup_load_info(struct load_info *info) info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info); + err = rewrite_section_headers(info, flags); if (err) return ERR_PTR(err); @@ -2607,11 +2682,14 @@ static struct module *setup_load_info(struct load_info *info) return mod; } -static int check_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo(struct module *mod, struct load_info *info, int flags) { const char *modmagic = get_modinfo(info, "vermagic"); int err; + if (flags & MODULE_INIT_IGNORE_VERMAGIC) + modmagic = NULL; + /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); @@ -2624,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info) } if (!get_modinfo(info, "intree")) - add_taint_module(mod, TAINT_OOT_MODULE); + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP); + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); @@ -2741,20 +2819,23 @@ static int move_module(struct module *mod, struct load_info *info) memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc_update_bounds(mod->init_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. - */ - kmemleak_ignore(ptr); - if (!ptr && mod->init_size) { - module_free(mod, mod->module_core); - return -ENOMEM; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + if (mod->init_size) { + ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr) { + module_free(mod, mod->module_core); + return -ENOMEM; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + } else + mod->module_init = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -2790,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod) * using GPL-only symbols it needs. */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); /* lve claims to be GPL but upstream won't provide source */ if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) @@ -2847,18 +2930,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } -static struct module *layout_and_allocate(struct load_info *info) +static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; Elf_Shdr *pcpusec; int err; - mod = setup_load_info(info); + mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; - err = check_modinfo(mod, info); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -2938,70 +3021,255 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module(name); - ret = !mod || mod->state != MODULE_STATE_COMING; + mod = find_module_all(name, true); + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; } +/* Call module constructors. */ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* This is where the real work happens */ +static int do_init_module(struct module *mod) +{ + int ret = 0; + + /* + * We want to find out whether @mod uses async during init. Clear + * PF_USED_ASYNC. async_schedule*() will set it. + */ + current->flags &= ~PF_USED_ASYNC; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; + } + if (ret > 0) { + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } + + /* Now it's a first class citizen! */ + mod->state = MODULE_STATE_LIVE; + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly + * detected block device can trigger request_module() of the + * default iosched from async probing task. Once userland helper + * reaches here, async_synchronize_full() will wait on the async + * task waiting on request_module() and deadlock. + * + * This deadlock is avoided by perfomring async_synchronize_full() + * iff module init queued any async jobs. This isn't a full + * solution as it will deadlock the same if module loading from + * async jobs nests more than once; however, due to the various + * constraints, this hack seems to be the best option for now. + * Please refer to the following thread for details. + * + * http://thread.gmane.org/gmane.linux.kernel/1420814 + */ + if (current->flags & PF_USED_ASYNC) + async_synchronize_full(); + + mutex_lock(&module_mutex); + /* Drop initial reference. */ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_ro_size = 0; + mod->init_text_size = 0; + mutex_unlock(&module_mutex); + wake_up_all(&module_wq); + + return 0; +} + +static int may_init_module(void) +{ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + return 0; +} + +/* + * We try to place it in the list now to make sure it's unique before + * we dedicate too many resources. In particular, temporary percpu + * memory exhaustion. + */ +static int add_unformed_module(struct module *mod) +{ + int err; + struct module *old; + + mod->state = MODULE_STATE_UNFORMED; + +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto out_unlocked; + goto again; + } + err = -EEXIST; + goto out; + } + list_add_rcu(&mod->list, &modules); + err = 0; + +out: + mutex_unlock(&module_mutex); +out_unlocked: + return err; +} + +static int complete_formation(struct module *mod, struct load_info *info) +{ + int err; + + mutex_lock(&module_mutex); + + /* Find duplicate symbols (must be called under lock). */ + err = verify_export_symbols(mod); + if (err < 0) + goto out; + + /* This relies on module_mutex for list integrity. */ + module_bug_finalize(info->hdr, info->sechdrs, mod); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + +out: + mutex_unlock(&module_mutex); + return err; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +static int load_module(struct load_info *info, const char __user *uargs, + int flags) { - struct load_info info = { NULL, }; - struct module *mod, *old; + struct module *mod; long err; - pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); + err = module_sig_check(info); + if (err) + goto free_copy; - /* Copy in the blobs from userspace, check they are vaguely sane. */ - err = copy_and_check(&info, umod, len, uargs); + err = elf_header_check(info); if (err) - return ERR_PTR(err); + goto free_copy; /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(&info); + mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { err = PTR_ERR(mod); goto free_copy; } + /* Reserve our place in the list. */ + err = add_unformed_module(mod); + if (err) + goto free_module; + #ifdef CONFIG_MODULE_SIG - mod->sig_ok = info.sig_ok; - if (!mod->sig_ok) - add_taint_module(mod, TAINT_FORCED_MODULE); + mod->sig_ok = info->sig_ok; + if (!mod->sig_ok) { + printk_once(KERN_NOTICE + "%s: module verification failed: signature and/or" + " required key missing - tainting kernel\n", + mod->name); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + } #endif /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) - goto free_module; + goto unlink_mod; /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, &info); + find_module_sections(mod, info); err = check_module_license_and_versions(mod); if (err) goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, &info); + setup_modinfo(mod, info); /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, &info); + err = simplify_symbols(mod, info); if (err < 0) goto free_modinfo; - err = apply_relocations(mod, &info); + err = apply_relocations(mod, info); if (err < 0) goto free_modinfo; - err = post_relocation(mod, &info); + err = post_relocation(mod, info); if (err < 0) goto free_modinfo; @@ -3014,72 +3282,39 @@ static struct module *load_module(void __user *umod, goto free_arch_cleanup; } - /* Mark state as coming so strong_try_module_get() ignores us. */ - mod->state = MODULE_STATE_COMING; - - /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. No one should access us, since - * strong_try_module_get() will fail. - * lockdep/oops can run asynchronous, so use the RCU list insertion - * function to insert in a way safe to concurrent readers. - * The mutex protects against concurrent writers. - */ -again: - mutex_lock(&module_mutex); - if ((old = find_module(mod->name)) != NULL) { - if (old->state == MODULE_STATE_COMING) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_arch_cleanup; - goto again; - } - err = -EEXIST; - goto unlock; - } - - /* This has to be done once we're sure module name is unique. */ - dynamic_debug_setup(info.debug, info.num_debug); - - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto ddebug; + dynamic_debug_setup(info->debug, info->num_debug); - module_bug_finalize(info.hdr, info.sechdrs, mod); - list_add_rcu(&mod->list, &modules); - mutex_unlock(&module_mutex); + /* Finally it's fully formed, ready to start executing. */ + err = complete_formation(mod, info); + if (err) + goto ddebug_cleanup; /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); if (err < 0) - goto unlink; + goto bug_cleanup; /* Link in to syfs. */ - err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); + err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto unlink; + goto bug_cleanup; /* Get rid of temporary copy. */ - free_copy(&info); + free_copy(info); /* Done! */ trace_module_load(mod); - return mod; - unlink: + return do_init_module(mod); + + bug_cleanup: + /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. */ - list_del_rcu(&mod->list); module_bug_cleanup(mod); - wake_up_all(&module_wq); - ddebug: - dynamic_debug_remove(info.debug); - unlock: mutex_unlock(&module_mutex); + ddebug_cleanup: + dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); free_arch_cleanup: @@ -3088,107 +3323,59 @@ again: free_modinfo(mod); free_unload: module_unload_free(mod); + unlink_mod: + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + wake_up_all(&module_wq); + mutex_unlock(&module_mutex); free_module: - module_deallocate(mod, &info); + module_deallocate(mod, info); free_copy: - free_copy(&info); - return ERR_PTR(err); -} - -/* Call module constructors. */ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif + free_copy(info); + return err; } -/* This is where the real work happens */ SYSCALL_DEFINE3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs) { - struct module *mod; - int ret = 0; + int err; + struct load_info info = { }; - /* Must have permission */ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; + err = may_init_module(); + if (err) + return err; - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) - return PTR_ERR(mod); + pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); + err = copy_module_from_user(umod, len, &info); + if (err) + return err; - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); + return load_module(&info, uargs, 0); +} - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + int err; + struct load_info info = { }; - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } + err = may_init_module(); + if (err) + return err; - /* Now it's a first class citizen! */ - mod->state = MODULE_STATE_LIVE; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC)) + return -EINVAL; - mutex_lock(&module_mutex); - /* Drop initial reference. */ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; -#endif - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); - wake_up_all(&module_wq); + err = copy_module_from_fd(fd, &info); + if (err) + return err; - return 0; + return load_module(&info, uargs, flags); } static inline int within(unsigned long addr, void *start, unsigned long size) @@ -3264,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { if (modname) @@ -3287,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3311,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3338,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -3380,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((ret = mod_find_symname(mod, name)) != 0) break; + } } preempt_enable(); return ret; @@ -3397,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, int ret; list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; for (i = 0; i < mod->num_symtab; i++) { ret = fn(data, mod->strtab + mod->symtab[i].st_name, mod, mod->symtab[i].st_value); @@ -3412,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf) { int bx = 0; + BUG_ON(mod->state == MODULE_STATE_UNFORMED); if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { @@ -3453,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; + /* We always ignore unformed modules. */ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -3513,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (mod->num_exentries == 0) continue; @@ -3561,10 +3768,13 @@ struct module *__module_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_core(addr, mod) || within_module_init(addr, mod)) return mod; + } return NULL; } EXPORT_SYMBOL_GPL(__module_address); @@ -3617,8 +3827,11 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; printk(" %s%s", mod->name, module_flags(mod, buf)); + } preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); diff --git a/kernel/mutex.c b/kernel/mutex.c index a307cc9c9526..52f23011b6e0 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,6 +19,7 @@ */ #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/interrupt.h> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 7e1c3de1ce45..afc0456f227a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. */ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) + struct task_struct *tsk, struct user_namespace *user_ns, + struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk); + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk); + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -122,6 +123,7 @@ out_ns: int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { err = -EPERM; goto out; } @@ -151,7 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, tsk->fs); + new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -183,19 +185,21 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { + struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; @@ -241,20 +245,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); + ei = PROC_I(file_inode(file)); ops = ei->ns_ops; if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/kernel/padata.c b/kernel/padata.c index 89fe3d1b9efb..072f4ee4eb89 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; unsigned int next_nr, next_index; - struct padata_parallel_queue *queue, *next_queue; + struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) goto out; } - queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); - if (queue->cpu_index == next_queue->cpu_index) { + if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; } diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff97..7c57cc9eee2c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -259,26 +259,19 @@ unsigned long get_taint(void) return tainted_mask; } -void add_taint(unsigned flag) +/** + * add_taint: add a taint flag if not already set. + * @flag: one of the TAINT_* constants. + * @lockdep_ok: whether lock debugging is still OK. + * + * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for + * some notewortht-but-not-corrupting cases, it can be set to true. + */ +void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { - /* - * Can't trust the integrity of the kernel anymore. - * We don't call directly debug_locks_off() because the issue - * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging/out-of-tree - * development and post-warning case. - */ - switch (flag) { - case TAINT_CRAP: - case TAINT_OOT_MODULE: - case TAINT_WARN: - case TAINT_FIRMWARE_WORKAROUND: - break; - - default: - if (__debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); - } + if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) + printk(KERN_WARNING + "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller, print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(taint); + /* Just a warning, don't kill lockdep. */ + add_taint(taint, LOCKDEP_STILL_OK); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) diff --git a/kernel/pid.c b/kernel/pid.c index fd996c1ed9f8..047dc6264638 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> +#include <linux/proc_fs.h> #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). @@ -269,8 +257,23 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; + hlist_del_rcu(&upid->pid_chain); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). + */ + wake_up_process(ns->child_reaper); + break; + case 0: + schedule_work(&ns->proc_work); + break; + } + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -292,6 +295,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -302,22 +306,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + goto out_unlock; + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: return pid; +out_unlock: + spin_unlock_irq(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -327,12 +341,18 @@ out_free: goto out; } +void disable_pid_allocation(struct pid_namespace *ns) +{ + spin_lock_irq(&pidmap_lock); + ns->nr_hashed &= ~PIDNS_HASH_ADDING; + spin_unlock_irq(&pidmap_lock); +} + struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct hlist_node *elem; struct upid *pnr; - hlist_for_each_entry_rcu(pnr, elem, + hlist_for_each_entry_rcu(pnr, &pid_hash[pid_hashfn(nr, ns)], pid_chain) if (pnr->nr == nr && pnr->ns == ns) return container_of(pnr, struct pid, @@ -344,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -428,7 +448,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -483,7 +503,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -494,7 +514,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; @@ -558,6 +578,9 @@ void __init pidhash_init(void) void __init pidmap_init(void) { + /* Veryify no one has done anything silly */ + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); @@ -569,6 +592,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7b07cc0dfb75..c1c3dc1c6023 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -10,6 +10,7 @@ #include <linux/pid.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/err.h> #include <linux/acct.h> @@ -71,10 +72,17 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, + struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; @@ -99,9 +107,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); + ns->user_ns = get_user_ns(user_ns); + ns->nr_hashed = PIDNS_HASH_ADDING; + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -109,14 +124,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -129,18 +138,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) + if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); + return create_pid_namespace(user_ns, old_ns); } static void free_pid_ns(struct kref *kref) @@ -170,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; + /* Don't allow any more processes into the pid namespace */ + disable_pid_allocation(pid_ns); + /* Ignore SIGCHLD causing any terminated children to autoreap */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; @@ -211,22 +226,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). */ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(¤t->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -239,9 +247,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !capable(CAP_SYS_ADMIN)) + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -250,7 +259,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = ¤t->nsproxy->pid_ns->last_pid; + tmp.data = &pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } @@ -299,6 +308,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } +static void *pidns_get(struct task_struct *task) +{ + struct pid_namespace *ns; + + rcu_read_lock(); + ns = get_pid_ns(task_active_pid_ns(task)); + rcu_read_unlock(); + + return ns; +} + +static void pidns_put(void *ns) +{ + put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *ancestor, *new = ns; + + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Only allow entering the current active pid namespace + * or a child of the current active pid namespace. + * + * This is required for fork to return a usable pid value and + * this maintains the property that processes and their + * children can not escape their current pid namespace. + */ + if (new->level < active->level) + return -EINVAL; + + ancestor = new; + while (ancestor->level > active->level) + ancestor = ancestor->parent; + if (ancestor != active) + return -EINVAL; + + put_pid_ns(nsproxy->pid_ns); + nsproxy->pid_ns = get_pid_ns(new); + return 0; +} + +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + +const struct proc_ns_operations pidns_operations = { + .name = "pid", + .type = CLONE_NEWPID, + .get = pidns_get, + .put = pidns_put, + .install = pidns_install, + .inum = pidns_inum, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d73840271dce..8fd709c9bb58 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -9,6 +9,7 @@ #include <asm/uaccess.h> #include <linux/kernel_stat.h> #include <trace/events/timer.h> +#include <linux/random.h> /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -154,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer, static inline cputime_t prof_ticks(struct task_struct *p) { - return p->utime + p->stime; + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + + return utime + stime; } static inline cputime_t virt_ticks(struct task_struct *p) { - return p->utime; + cputime_t utime; + + task_cputime(p, &utime, NULL); + + return utime; } static int @@ -470,16 +479,23 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + cputime_t utime, stime; + + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + utime, stime, tsk->se.sum_exec_runtime); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct signal_struct *const sig = tsk->signal; + cputime_t utime, stime; + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->signal->cpu_timers, - tsk->utime + sig->utime, tsk->stime + sig->stime, + utime + sig->utime, stime + sig->stime, tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -1223,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, + .utime = utime, + .stime = stime, .sum_exec_runtime = tsk->se.sum_exec_runtime }; @@ -1398,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, while (!signal_pending(current)) { if (timer.it.cpu.expires.sched == 0) { /* - * Our timer fired and was reset. + * Our timer fired and was reset, below + * deletion can not fail. */ + posix_cpu_timer_del(&timer); spin_unlock_irq(&timer.it_lock); return 0; } @@ -1417,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, it); + error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + if (!error) { + /* + * Timer is now unarmed, deletion can not fail. + */ + posix_cpu_timer_del(&timer); + } spin_unlock_irq(&timer.it_lock); + while (error == TIMER_RETRY) { + /* + * We need to handle case when timer was or is in the + * middle of firing. In other cases we already freed + * resources. + */ + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + } + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { /* * It actually did fire already. diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 69185ae6b701..6edbb2c55c22 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - retry: - if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { - error = -EAGAIN; - goto out; - } + + idr_preload(GFP_KERNEL); spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); + error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); spin_unlock_irq(&idr_lock); - if (error) { - if (error == -EAGAIN) - goto retry; + idr_preload_end(); + if (error < 0) { /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) */ - error = -EAGAIN; + if (error == -ENOSPC) + error = -EAGAIN; goto out; } + new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; @@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { @@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, err = kc->clock_adj(which_clock, &ktx); - if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; return err; diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e2..c6422ffeda9a 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend); void queue_up_suspend_work(void) { - if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + if (autosleep_state > PM_SUSPEND_ON) queue_work(autosleep_wq, &suspend_work); } diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de1..d77663bfedeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; + suspend_state_t state = PM_SUSPEND_MIN; const char * const *s; #endif char *p; @@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -576,6 +600,9 @@ static struct attribute * g[] = { &pm_print_times_attr.attr, #endif #endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, +#endif NULL, }; diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6f..98088e0e71e8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -21,7 +21,7 @@ /* * Timeout for stopping processes */ -#define TIMEOUT (20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { @@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only) do_gettimeofday(&start); - end_time = jiffies + TIMEOUT; + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad6..587dddeebf15 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, "%s called for unknown object.", __func__)) return; - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..d4feda084a3a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,12 +30,38 @@ #include "power.h" const char *const pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = "freeze", [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", }; static const struct platform_suspend_ops *suspend_ops; +static bool need_suspend_ops(suspend_state_t state) +{ + return !!(state > PM_SUSPEND_FREEZE); +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +static void freeze_begin(void) +{ + suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ + wait_event(suspend_freeze_wait_head, suspend_freeze_wake); +} + +void freeze_wake(void) +{ + suspend_freeze_wake = true; + wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { + if (state == PM_SUSPEND_FREEZE) + return true; /* - * All states need lowlevel support and need to be valid to the lowlevel + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel + * support and need to be valid to the lowlevel * implementation, no valid callback implies that none are valid. */ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); @@ -89,11 +118,11 @@ static int suspend_test(int level) * hibernation). Run suspend notifiers, allocate the "suspend" console and * freeze processes. */ -static int suspend_prepare(void) +static int suspend_prepare(suspend_state_t state) { int error; - if (!suspend_ops || !suspend_ops->enter) + if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) return -EPERM; pm_prepare_console(); @@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (suspend_ops->prepare) { + if (need_suspend_ops(state) && suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) goto Platform_finish; @@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - if (suspend_ops->prepare_late) { + if (need_suspend_ops(state) && suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) goto Platform_wake; } + /* + * PM_SUSPEND_FREEZE equals + * frozen processes + suspended devices + idle processors. + * Thus we should invoke freeze_enter() soon after + * all the devices are suspended. + */ + if (state == PM_SUSPEND_FREEZE) { + freeze_enter(); + goto Platform_wake; + } + if (suspend_test(TEST_PLATFORM)) goto Platform_wake; @@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - if (suspend_ops->wake) + if (need_suspend_ops(state) && suspend_ops->wake) suspend_ops->wake(); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (suspend_ops->finish) + if (need_suspend_ops(state) && suspend_ops->finish) suspend_ops->finish(); return error; @@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (!suspend_ops) + if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; trace_machine_suspend(state); - if (suspend_ops->begin) { + if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) goto Close; @@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup + } while (!error && !wakeup && need_suspend_ops(state) && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: @@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state) ftrace_start(); resume_console(); Close: - if (suspend_ops->end) + if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: - if (suspend_ops->recover) + if (need_suspend_ops(state) && suspend_ops->recover) suspend_ops->recover(); goto Resume_devices; } @@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; + if (state == PM_SUSPEND_FREEZE) + freeze_begin(); + printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); + error = suspend_prepare(state); if (error) goto Unlock; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac7..9b2a1d58558d 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) rtc_set_alarm(rtc, &alm); } -static int __init has_wakealarm(struct device *dev, void *name_ptr) +static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); @@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(const char **)name_ptr = dev_name(dev); return 1; } @@ -159,8 +158,8 @@ static int __init test_suspend(void) static char warn_no_rtc[] __initdata = KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - char *pony = NULL; struct rtc_device *rtc = NULL; + struct device *dev; /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) @@ -171,9 +170,9 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) + rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); goto done; diff --git a/kernel/printk.c b/kernel/printk.c index 22e070f3470a..0b31715f335a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -42,6 +42,7 @@ #include <linux/notifier.h> #include <linux/rculist.h> #include <linux/poll.h> +#include <linux/irq_work.h> #include <asm/uaccess.h> @@ -747,6 +748,21 @@ void __init setup_log_buf(int early) free, (free * 100) / __LOG_BUF_LEN); } +static bool __read_mostly ignore_loglevel; + +static int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 0; +} + +early_param("ignore_loglevel", ignore_loglevel_setup); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -770,13 +786,15 @@ static int __init boot_delay_setup(char *str) } __setup("boot_delay=", boot_delay_setup); -static void boot_delay_msec(void) +static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + || (level >= console_loglevel && !ignore_loglevel)) { return; + } k = (unsigned long long)loops_per_msec * boot_delay; @@ -795,7 +813,7 @@ static void boot_delay_msec(void) } } #else -static inline void boot_delay_msec(void) +static inline void boot_delay_msec(int level) { } #endif @@ -853,10 +871,11 @@ static size_t print_time(u64 ts, char *buf) if (!printk_time) return 0; + rem_nsec = do_div(ts, 1000000000); + if (!buf) - return 15; + return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); - rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); } @@ -1238,21 +1257,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -1498,7 +1502,7 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; - boot_delay_msec(); + boot_delay_msec(level); printk_delay(); /* This stops the holder of console_sem just where we want him */ @@ -1964,30 +1968,32 @@ int is_console_locked(void) static DEFINE_PER_CPU(int, printk_pending); static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); -void printk_tick(void) +static void wake_up_klogd_work_func(struct irq_work *irq_work) { - if (__this_cpu_read(printk_pending)) { - int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); } -} -int printk_needs_cpu(int cpu) -{ - if (cpu_is_offline(cpu)) - printk_tick(); - return __this_cpu_read(printk_pending); + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); } +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + void wake_up_klogd(void) { - if (waitqueue_active(&log_wait)) + preempt_disable(); + if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); } static void console_cont_flush(char *text, size_t size) @@ -2468,6 +2474,7 @@ int printk_sched(const char *fmt, ...) va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); local_irq_restore(flags); return r; diff --git a/kernel/profile.c b/kernel/profile.c index 1f391819c42f..dc3384ee874e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -37,9 +37,6 @@ struct profile_hit { #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; - static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; @@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - if (timer_hook) - return -EBUSY; - timer_hook = hook; - return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - WARN_ON(hook != timer_hook); - timer_hook = NULL; - /* make sure all CPUs see the NULL hook */ - synchronize_sched(); /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - - #ifdef CONFIG_SMP /* * Each cpu has a pair of open-addressed hashtables for pending @@ -436,8 +414,6 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) - timer_hook(regs); if (!user_mode(regs) && prof_cpu_mask != NULL && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f5e55dda955..acbd28424d81 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child) * TASK_KILLABLE sleeps. */ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); + ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ + bool ret = false; + + /* Lockless, nobody but us can set this flag */ + if (task->jobctl & JOBCTL_LISTENING) + return ret; + + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !__fatal_signal_pending(task)) { + task->state = __TASK_TRACED; + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ + if (task->state != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); + + spin_lock_irq(&task->sighand->siglock); + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + spin_unlock_irq(&task->sighand->siglock); +} + /** * ptrace_check_attach - check whether ptracee is ready for ptrace operation * @child: ptracee to check for @@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child) * RETURNS: * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, bool ignore_state) +static int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; @@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { + if (child->ptrace && child->parent == current) { + WARN_ON(child->state == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (ignore_state || (task_is_traced(child) && - !(child->jobctl & JOBCTL_LISTENING))) + if (ignore_state || ptrace_freeze_traced(child)) ret = 0; - spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + if (!ret && !ignore_state) { + if (!wait_task_inactive(child, __TASK_TRACED)) { + /* + * This can only happen if may_ptrace_stop() fails and + * ptrace_stop() changes ->state back to TASK_RUNNING, + * so we should not worry about leaking __TASK_TRACED. + */ + WARN_ON(child->state == __TASK_TRACED); + ret = -ESRCH; + } + } - /* All systems go.. */ return ret; } @@ -215,8 +254,12 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + rcu_read_lock(); + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -280,8 +323,10 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; + rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); @@ -311,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request, */ if (task_is_stopped(task) && task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) - signal_wake_up(task, 1); + signal_wake_up_state(task, __TASK_STOPPED); spin_unlock(&task->sighand->siglock); @@ -457,6 +502,9 @@ void exit_ptrace(struct task_struct *tracer) return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (unlikely(p->ptrace & PT_EXITKILL)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, p); + if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } @@ -664,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, kiov->iov_len, kiov->iov_base); } +/* + * This is declared in linux/regset.h and defined in machine-dependent + * code. We put the export here, near the primary machine-neutral use, + * to ensure no machine forgets it. + */ +EXPORT_SYMBOL_GPL(task_user_regset_view); #endif int ptrace_request(struct task_struct *child, long request, @@ -728,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request, * tracee into STOP. */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); unlock_task_sighand(child, &flags); ret = 0; @@ -754,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request, * start of this trap and now. Trigger re-trap. */ if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); + ptrace_signal_wake_up(child, true); ret = 0; } unlock_task_sighand(child, &flags); @@ -891,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); @@ -1030,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); - if (!ret) + if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); + } out_put_task_struct: put_task_struct(child); diff --git a/kernel/rcu.h b/kernel/rcu.h index 20dfba576c2b..7f8e7590e3e5 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) extern int rcu_expedited; +#ifdef CONFIG_RCU_STALL_COMMON + +extern int rcu_cpu_stall_suppress; +int rcu_jiffies_till_stall_check(void); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b44..48ab70384a4c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, unsigned long c) { - trace_rcu_torture_read(rcutorturename, rhp); + trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); } EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) #endif + +#ifdef CONFIG_RCU_STALL_COMMON + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + +module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2a..a0714a51b6d7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -#include "rcutiny_plugin.h" - static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; +#include "rcutiny_plugin.h" + /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long newval) { @@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * interrupts don't count, we must be running at the first interrupt * level. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return rcu_dynticks_nesting <= 1; } @@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { + reset_cpu_stall_ticks(rcp); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { + check_cpu_stalls(); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a2309b..8a233002faeb 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -33,6 +33,9 @@ struct rcu_ctrlblk { struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ + RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ + RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ RCU_TRACE(char *name); /* Name of RCU type. */ }; @@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_RCU_TRACE + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stall_preempt(void); + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall_preempt()); +} + #ifdef CONFIG_TINY_PREEMPT_RCU #include <linux/delay.h> @@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); + reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) @@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); +static void check_cpu_stall_preempt(void) +{ +#ifdef CONFIG_TINY_PREEMPT_RCU + check_cpu_stall(&rcu_preempt_ctrlblk.rcb); +#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +} + #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85fd..e1f3a8c96724 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -46,6 +46,7 @@ #include <linux/stat.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <linux/trace_clock.h> #include <asm/byteorder.h> MODULE_LICENSE("GPL"); @@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ @@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_uninterruptible(1); + schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) @@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void) return; if (atomic_xchg(&beenhere, 1) != 0) return; - do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); ftrace_dump(DUMP_ALL); } @@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused) { int idx; int completed; + int completed_end; static DEFINE_RCU_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + unsigned long long ts; idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused) cur_ops->readunlock(idx); return; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); @@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1094,11 +1114,13 @@ static int rcu_torture_reader(void *arg) { int completed; + int completed_end; int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; struct timer_list t; + unsigned long long ts; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); @@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg) } idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg) schedule_timeout_interruptible(HZ); continue; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); cur_ops->read_delay(&rand); @@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed_ptr(reader_tasks[i], shuffle_tmp_mask); } - if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], shuffle_tmp_mask); } - if (writer_task) set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; @@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_wq = kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), GFP_KERNEL); - if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614e..5b8ad827fd86 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimized synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_sched() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -217,12 +217,6 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - -module_param(rcu_cpu_stall_suppress, int, 0644); -module_param(rcu_cpu_stall_timeout, int, 0644); - static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; @@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) } /* - * Does the current CPU require a yet-as-unscheduled grace period? + * Does the current CPU require a not-yet-started grace period? + * The caller must have disabled interrupts to prevent races with + * normal callback registry. */ static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - struct rcu_head **ntp; + int i; - ntp = rdp->nxttail[RCU_DONE_TAIL + - (ACCESS_ONCE(rsp->completed) != rdp->completed)]; - return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && - !rcu_gp_in_progress(rsp); + if (rcu_gp_in_progress(rsp)) + return 0; /* No, a grace period is already in progress. */ + if (!rdp->nxttail[RCU_NEXT_TAIL]) + return 0; /* No, this is a no-CBs (or offline) CPU. */ + if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + return 1; /* Yes, this CPU has newly registered callbacks. */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rdp->nxttail[i - 1] != rdp->nxttail[i] && + ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + rdp->nxtcompleted[i])) + return 1; /* Yes, CBs for future grace period. */ + return 0; /* No grace period needed. */ } /* @@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, 0); + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); * interrupt from idle, return true. The caller must have at least * disabled preemption. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } @@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static int jiffies_till_stall_check(void) -{ - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; - till_stall_check = 3; - } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); + rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } /* @@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; + rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) rsp->jiffies_stall = jiffies + - 3 * jiffies_till_stall_check() + 3; + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ @@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - /** * rcu_cpu_stall_reset - prevent further stall warnings in current grace period * @@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void) rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static void __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -} - /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice @@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp) } /* + * Determine the value that ->completed will have at the end of the + * next subsequent grace period. This is used to tag callbacks so that + * a CPU can invoke callbacks in a timely fashion even if that CPU has + * been dyntick-idle for an extended period with callbacks under the + * influence of RCU_FAST_NO_HZ. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static unsigned long rcu_cbs_completed(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + /* + * If RCU is idle, we just wait for the next grace period. + * But we can only be sure that RCU is idle if we are looking + * at the root rcu_node structure -- otherwise, a new grace + * period might have started, but just not yet gotten around + * to initializing the current non-root rcu_node structure. + */ + if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) + return rnp->completed + 1; + + /* + * Otherwise, wait for a possible partial grace period and + * then the subsequent full grace period. + */ + return rnp->completed + 2; +} + +/* + * If there is room, assign a ->completed number to any callbacks on + * this CPU that have not already been assigned. Also accelerate any + * callbacks that were previously assigned a ->completed number that has + * since proven to be too conservative, which can happen if callbacks get + * assigned a ->completed number while RCU is idle, but with reference to + * a non-root rcu_node structure. This function is idempotent, so it does + * not hurt to call it repeatedly. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + int i; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Starting from the sublist containing the callbacks most + * recently assigned a ->completed number and working down, find the + * first sublist that is not assignable to an upcoming grace period. + * Such a sublist has something in it (first two tests) and has + * a ->completed number assigned that will complete sooner than + * the ->completed number for newly arrived callbacks (last test). + * + * The key point is that any later sublist can be assigned the + * same ->completed number as the newly arrived callbacks, which + * means that the callbacks in any of these later sublist can be + * grouped into a single sublist, whether or not they have already + * been assigned a ->completed number. + */ + c = rcu_cbs_completed(rsp, rnp); + for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] != rdp->nxttail[i - 1] && + !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) + break; + + /* + * If there are no sublist for unassigned callbacks, leave. + * At the same time, advance "i" one sublist, so that "i" will + * index into the sublist where all the remaining callbacks should + * be grouped into. + */ + if (++i >= RCU_NEXT_TAIL) + return; + + /* + * Assign all subsequent callbacks' ->completed number to the next + * full grace period and group them all in the sublist initially + * indexed by "i". + */ + for (; i <= RCU_NEXT_TAIL; i++) { + rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtcompleted[i] = c; + } + + /* Trace depending on how much we were able to accelerate. */ + if (!*rdp->nxttail[RCU_WAIT_TAIL]) + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + else + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); +} + +/* + * Move any callbacks whose grace period has completed to the + * RCU_DONE_TAIL sublist, then compact the remaining sublists and + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * sublist. This function is idempotent, so it does not hurt to + * invoke it repeatedly. As long as it is not invoked -too- often... + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + int i, j; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Find all callbacks whose ->completed numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) + break; + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; + } + /* Clean up any sublist tail pointers that were misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; + + /* Copy down callbacks to fill in empty sublists. */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) + break; + rdp->nxttail[j] = rdp->nxttail[i]; + rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; + } + + /* Classify any remaining callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); +} + +/* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp * belongs. In addition, the corresponding leaf rcu_node structure's @@ -1080,12 +1190,15 @@ static void __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* Did another grace period end? */ - if (rdp->completed != rnp->completed) { + if (rdp->completed == rnp->completed) { - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + /* No, so just accelerate recent callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + + } else { + + /* Advance callbacks. */ + rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* * Because there is no grace period in progress right now, * any callbacks we have up to this point will be satisfied - * by the next grace period. So promote all callbacks to be - * handled after the end of the next grace period. If the - * CPU is not yet aware of the end of the previous grace period, - * we need to allow for the callback advancement that will - * occur when it does become aware. Deadlock prevents us from - * making it aware at this point: We cannot acquire a leaf - * rcu_node ->lock while holding the root rcu_node ->lock. + * by the next grace period. So this is a good place to + * assign a grace period number to recently posted callbacks. */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - if (rdp->completed == rsp->completed) - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ @@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } @@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) long bl, count, count_lazy; int i; - /* If no callbacks are ready, just return.*/ + /* If no callbacks are ready, just return. */ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), @@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* - * Advance callbacks in response to end of earlier grace - * period that some other CPU ended. - */ + /* Handle the end of a grace period that some other CPU ended. */ rcu_process_gp_end(rsp, rdp); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); /* Does this CPU require a not-yet-started grace period? */ + local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ rcu_start_gp(rsp, flags); /* releases above lock */ + } else { + local_irq_restore(flags); } /* If there are callbacks ready, invoke them. */ @@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -#ifdef CONFIG_RCU_USER_QS - WARN_ON_ONCE(rdp->dynticks->in_user); -#endif rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + /* Silence gcc 4.8 warning about array index out of range. */ + if (rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls overflow"); + /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) @@ -3074,7 +3181,6 @@ void __init rcu_init(void) cpu_notifier(rcu_cpu_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - check_cpu_stall_init(); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093d..c896b5045d9d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,10 +102,6 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -#ifdef CONFIG_RCU_USER_QS - bool ignore_user_qs; /* Treat userspace as extended QS or not */ - bool in_user; /* Is the CPU in userland from RCU POV? */ -#endif }; /* RCU's kthread states for tracing. */ @@ -282,6 +278,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; + unsigned long nxtcompleted[RCU_NEXT_SIZE]; + /* grace periods for sublists. */ long qlen_lazy; /* # of lazy queued callbacks */ long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; @@ -343,11 +341,6 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f6e5ec2932b4..c1cc7e17ff9d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -40,8 +40,7 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ -static bool rcu_nocb_poll; /* Offload kthread are to poll. */ -module_param(rcu_nocb_poll, bool, 0444); +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static char __initdata nocb_buf[NR_CPUS * 5]; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -2159,6 +2158,13 @@ static int __init rcu_nocb_setup(char *str) } __setup("rcu_nocbs=", rcu_nocb_setup); +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = 1; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + /* Is the specified CPU a no-CPUs CPU? */ static bool is_nocb_cpu(int cpu) { @@ -2366,10 +2372,11 @@ static int rcu_nocb_kthread(void *arg) for (;;) { /* If not polling, wait for next batch of callbacks. */ if (!rcu_nocb_poll) - wait_event(rdp->nocb_wq, rdp->nocb_head); + wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); list = ACCESS_ONCE(rdp->nocb_head); if (!list) { schedule_timeout_interruptible(1); + flush_signals(current); continue; } diff --git a/kernel/relay.c b/kernel/relay.c index e8cd2027abbd..01ab081ac53a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_lock(&file_inode(filp)->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&file_inode(filp)->i_mutex); return desc->written; } diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 3920d593e63c..ff55247e7049 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) val = counter->usage; counter->usage -= val; + return counter->usage; } -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; + u64 ret = 0; local_irq_save(flags); for (c = counter; c != top; c = c->parent) { + u64 r; spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); + r = res_counter_uncharge_locked(c, val); + if (c == counter) + ret = r; spin_unlock(&c->lock); } local_irq_restore(flags); + return ret; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) { - res_counter_uncharge_until(counter, NULL, val); + return res_counter_uncharge_until(counter, NULL, val); } static inline unsigned long long * diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 16502d3a71c8..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -17,6 +17,7 @@ * See rt.c in preempt-rt for proper credits and further information */ #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/spinlock.h> diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 98ec49475460..7890b10084a7 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -10,6 +10,7 @@ #include <linux/kthread.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/freezer.h> diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e691c993..1e09308bf2a1 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/timer.h> #include "rtmutex_common.h" diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 6850f53e02d8..b3c6c3fcd847 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif + sched_offline_group(ag->tg); sched_destroy_group(ag->tg); } @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; + sched_online_group(tg, &root_task_group); + kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 01edad9b5d71..7f12624a393c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,7 +83,7 @@ #endif #include "sched.h" -#include "../workqueue_sched.h" +#include "../workqueue_internal.h" #include "../smpboot.h" #define CREATE_TRACE_POINTS @@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1124,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; - /* Look for allowed, online CPU in same node. */ - for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; + /* + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. + */ + if (nid != -1) { + nodemask = cpumask_of_node(nid); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } } for (;;) { @@ -1515,7 +1533,8 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1560,7 +1579,40 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ +} + +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ + numabalancing_enabled = enabled; } +#endif /* CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ /* * fork()/clone()-time setup: @@ -1700,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister); static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); } @@ -1711,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_out(notifier, next); } @@ -1927,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { @@ -1943,23 +1992,6 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - unsigned long long nr_context_switches(void) { int i; @@ -2744,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (irqs_disabled()) print_irqtrace_events(prev); dump_stack(); - add_taint(TAINT_WARN); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } /* @@ -3226,7 +3258,8 @@ void complete_all(struct completion *x) EXPORT_SYMBOL(complete_all); static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); @@ -3239,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) } __set_current_state(state); spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); + timeout = action(timeout); spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); @@ -3250,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state) return timeout ?: 1; } -static long __sched -wait_for_common(struct completion *x, long timeout, int state) +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { might_sleep(); spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); + timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); return timeout; } +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + /** * wait_for_completion: - waits for completion of a task * @x: holds the state of this particular completion @@ -3297,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) EXPORT_SYMBOL(wait_for_completion_timeout); /** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** * wait_for_completion_interruptible: - waits for completion of a task (w/intr) * @x: holds the state of this particular completion * @@ -4056,8 +4135,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if (retval) @@ -4326,7 +4411,7 @@ bool __sched yield_to(struct task_struct *p, bool preempt) struct task_struct *curr = current; struct rq *rq, *p_rq; unsigned long flags; - bool yielded = 0; + int yielded = 0; local_irq_save(flags); rq = this_rq(); @@ -4632,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif @@ -7125,7 +7211,6 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - unsigned long flags; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -7137,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); @@ -7146,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ @@ -7164,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ unsigned long flags; int i; @@ -7175,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7473,6 +7566,25 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -7529,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct task_group *parent; + + if (!cgrp->parent) + return 0; + + parent = cgroup_tg(cgrp->parent); + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7536,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) sched_destroy_group(tg); } +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_offline_group(tg); +} + static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -7891,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 23aa789c53ee..1095e878a46f 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -28,6 +28,8 @@ */ #include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 293b202fcf79..ed12cbb135f4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@ #include <linux/tsacct_kern.h> #include <linux/kernel_stat.h> #include <linux/static_key.h> +#include <linux/context_tracking.h> #include "sched.h" @@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for user time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for system time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; struct task_struct *t; times->utime = sig->utime; @@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime += t->utime; - times->stime += t->stime; + task_cputime(tsk, &utime, &stime); + times->utime += utime; + times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); } -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks) irqtime_account_process_tick(current, 0, rq); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to @@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (vtime_accounting_enabled()) + return; + if (sched_clock_irqtime) { irqtime_account_process_tick(p, user_tick, rq); return; @@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks) account_idle_time(jiffies_to_cputime(ticks)); } - -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ /* * Use precise platform statistics if available: @@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime *st = cputime.stime; } -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - #ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_task_switch(struct task_struct *prev) { + if (!vtime_accounting_enabled()) + return; + if (is_idle_task(prev)) vtime_account_idle(prev); else vtime_account_system(prev); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE vtime_account_user(prev); +#endif arch_vtime_task_switch(prev); } #endif @@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { - if (in_interrupt() || !is_idle_task(tsk)) - vtime_account_system(tsk); - else - vtime_account_idle(tsk); + if (!vtime_accounting_enabled()) + return; + + if (!in_interrupt()) { + /* + * If we interrupted user, context_tracking_in_user() + * is 1 because the context tracking don't hook + * on irq entry/exit. This way we know if + * we need to flush user time on kernel entry. + */ + if (context_tracking_in_user()) { + vtime_account_user(tsk); + return; + } + + if (is_idle_task(tsk)) { + vtime_account_idle(tsk); + return; + } + } + vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif +#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) { u64 temp = (__force u64) rtime; - temp *= (__force u64) utime; + temp *= (__force u64) stime; if (sizeof(cputime_t) == 4) temp = div_u64(temp, (__force u32) total); @@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime, total; + cputime_t rtime, stime, total; - utime = curr->utime; - total = utime + curr->stime; + stime = curr->stime; + total = stime + curr->utime; /* * Tick based cputime accounting depend on random scheduling @@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr, rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) - utime = scale_utime(utime, rtime, total); + stime = scale_stime(stime, rtime, total); else - utime = rtime; + stime = rtime; /* * If the tick based count grows faster than the scheduler one, * the result of the scaling may go backward. * Let's enforce monotonicity. */ - prev->utime = max(prev->utime, utime); - prev->stime = max(prev->stime, rtime - prev->utime); + prev->stime = max(prev->stime, stime); + prev->utime = max(prev->utime, rtime - prev->stime); *ut = prev->utime; *st = prev->stime; @@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr, void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime = { - .utime = p->utime, - .stime = p->stime, .sum_exec_runtime = p->se.sum_exec_runtime, }; + task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } @@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static unsigned long long vtime_delta(struct task_struct *tsk) +{ + unsigned long long clock; + + clock = local_clock(); + if (clock < tsk->vtime_snap) + return 0; + + return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) +{ + unsigned long long delta = vtime_delta(tsk); + + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + tsk->vtime_snap += delta; + + /* CHECKME: always safe to convert nsecs to cputime? */ + return nsecs_to_cputime(delta); +} + +static void __vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_system(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_irq_exit(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + if (context_tracking_in_user()) + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_user(struct task_struct *tsk) +{ + cputime_t delta_cpu; + + if (!vtime_accounting_enabled()) + return; + + delta_cpu = get_vtime_delta(tsk); + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_SYS; + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags |= PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_idle(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_idle_time(delta_cpu); +} + +bool vtime_accounting_enabled(void) +{ + return context_tracking_active(); +} + +void arch_vtime_task_switch(struct task_struct *prev) +{ + write_seqlock(&prev->vtime_seqlock); + prev->vtime_snap_whence = VTIME_SLEEPING; + write_sequnlock(&prev->vtime_seqlock); + + write_seqlock(¤t->vtime_seqlock); + current->vtime_snap_whence = VTIME_SYS; + current->vtime_snap = sched_clock(); + write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t) +{ + unsigned long flags; + + write_seqlock_irqsave(&t->vtime_seqlock, flags); + t->vtime_snap_whence = VTIME_SYS; + t->vtime_snap = sched_clock(); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ + unsigned int seq; + cputime_t gtime; + + do { + seq = read_seqbegin(&t->vtime_seqlock); + + gtime = t->gtime; + if (t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqretry(&t->vtime_seqlock, seq)); + + return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. + */ +static void +fetch_task_cputime(struct task_struct *t, + cputime_t *u_dst, cputime_t *s_dst, + cputime_t *u_src, cputime_t *s_src, + cputime_t *udelta, cputime_t *sdelta) +{ + unsigned int seq; + unsigned long long delta; + + do { + *udelta = 0; + *sdelta = 0; + + seq = read_seqbegin(&t->vtime_seqlock); + + if (u_dst) + *u_dst = *u_src; + if (s_dst) + *s_dst = *s_src; + + /* Task is sleeping, nothing to add */ + if (t->vtime_snap_whence == VTIME_SLEEPING || + is_idle_task(t)) + continue; + + delta = vtime_delta(t); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { + *udelta = delta; + } else { + if (t->vtime_snap_whence == VTIME_SYS) + *sdelta = delta; + } + } while (read_seqretry(&t->vtime_seqlock, seq)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utime, stime, &t->utime, + &t->stime, &udelta, &sdelta); + if (utime) + *utime += udelta; + if (stime) + *stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utimescaled, stimescaled, + &t->utimescaled, &t->stimescaled, &udelta, &sdelta); + if (utimescaled) + *utimescaled += cputime_to_scaled(udelta); + if (stimescaled) + *stimescaled += cputime_to_scaled(sdelta); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2cd3c1b4e582..75024a673520 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - /* - * May be NULL if the underlying cgroup isn't fully-created yet - */ - if (!tg->css.cgroup) { - group_path[0] = '\0'; - return group_path; - } cgroup_path(tg->css.cgroup, group_path, PATH_MAX); return group_path; } @@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", - atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", + (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", @@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu) { unsigned int freq = cpu_khz ? : 1; - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } #else - SEQ_printf(m, "\ncpu#%d\n", cpu); + SEQ_printf(m, "cpu#%d\n", cpu); #endif #define P(x) \ @@ -330,6 +323,7 @@ do { \ print_rq(m, rq, cpu); rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); } static const char *sched_tunable_scaling_names[] = { @@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = { "linear" }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m) { u64 ktime, sched_clk, cpu_clk; unsigned long flags; - int cpu; local_irq_save(flags); ktime = ktime_to_ns(ktime_get()); @@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} - for_each_online_cpu(cpu) - print_cpu(m, cpu); +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); - SEQ_printf(m, "\n"); + if (cpu != -1) + print_cpu(m, cpu); + else + sched_debug_header(m); return 0; } void sysrq_sched_debug_show(void) { - sched_debug_show(NULL, NULL); + int cpu; + + sched_debug_header(NULL); + for_each_online_cpu(cpu) + print_cpu(NULL, cpu); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; } static int sched_debug_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_debug_show, NULL); + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; } static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sched_debug_release, }; static int __init init_sched_debug_procfs(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59e072b2db97..7a33e5986fc5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,9 @@ #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/migrate.h> +#include <linux/task_work.h> #include <trace/events/sched.h> @@ -774,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms + */ +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +static void task_numa_placement(struct task_struct *p) +{ + int seq; + + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages, bool migrated) +{ + struct task_struct *p = current; + + if (!sched_feat_numa(NUMA)) + return; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + /* + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes + */ + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); + + task_numa_placement(p); +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long start, end; + long pages; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + /* + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work + */ + if (migrate_ratelimited(numa_node_id())) + return; + + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + if (!pages) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + if (!vma) { + reset_ptenuma_scan(p); + start = 0; + vma = mm->mmap; + } + for (; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. They are not likely to be of relevance */ + if (vma->vm_end - vma->vm_start < HPAGE_SIZE) + continue; + + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + pages -= change_prot_numa(vma, start, end); + + start = end; + if (pages <= 0) + goto out; + } while (end != vma->vm_end); + } + +out: + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. + */ + if (vma) + mm->numa_scan_offset = start; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -1265,7 +1492,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) } __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); - update_cfs_shares(cfs_rq); } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1454,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -1475,8 +1699,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - account_entity_enqueue(cfs_rq, se); enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1549,6 +1774,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1568,8 +1794,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1583,7 +1809,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - se->on_rq = 0; + update_cfs_shares(cfs_rq); } /* @@ -2435,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -static void unthrottle_offline_cfs_rqs(struct rq *rq) +static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -2595,8 +2821,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); - update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -2656,8 +2882,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); - update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -3026,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_idle_sibling(struct task_struct *p, int target) { - int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); struct sched_domain *sd; struct sched_group *sg; - int i; + int i = task_cpu(p); - /* - * If the task is going to be woken-up on this cpu and if it is - * already idle, then it is the right target. - */ - if (target == cpu && idle_cpu(cpu)) - return cpu; + if (idle_cpu(target)) + return target; /* - * If the task is going to be woken-up on the cpu where it previously - * ran and if it is currently idle, then it the right target. + * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (target == prev_cpu && idle_cpu(prev_cpu)) - return prev_cpu; + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; /* * Otherwise, iterate the domains and find an elegible idle cpu. @@ -3058,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target) goto next; for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) + if (i == target || !idle_cpu(i)) goto next; } @@ -5500,6 +5719,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); + update_rq_runnable_avg(rq, 1); } @@ -5837,11 +6059,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) { + for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); - /* update contribution to parent */ - update_entity_load_avg(se, 1); - } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -5873,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e68e69ab917d..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) +#endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 418feb01344e..127a2c4cf4ab 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@ #include <linux/slab.h> +int sched_rr_timeslice = RR_TIMESLICE; + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; @@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; int i, weight, more = 0; u64 rt_period; @@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq) return; delta_exec = rq->clock_task - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; return 0; } @@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->on_rq && !rq->rt.rt_nr_running) - pull_rt_task(rq); + if (!p->on_rq || rq->rt.rt_nr_running) + return; + + if (pull_rt_task(rq)) + resched_task(rq->curr); } void init_sched_rt_class(void) @@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (soft != RLIM_INFINITY) { unsigned long next; - p->rt.timeout++; + if (p->rt.watchdog_stamp != jiffies) { + p->rt.timeout++; + p->rt.watchdog_stamp = jiffies; + } + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; @@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = RR_TIMESLICE; + p->rt.time_slice = sched_rr_timeslice; /* * Requeue to the end of queue if we (and all of our ancestors) are the @@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return RR_TIMESLICE; + return sched_rr_timeslice; else return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5eca173b563f..cc03cfdf469f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,7 @@ #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -663,6 +665,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 903ffa9e8872..e036eda1a9c9 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v) if (mask_str == NULL) return -ENOMEM; - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + if (v == (void *)1) { + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + } else { + struct rq *rq; #ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); /* runqueue-specific stats */ seq_printf(seq, @@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v) return 0; } -static int schedstat_open(struct inode *inode, struct file *file) +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *schedstat_start(struct seq_file *file, loff_t *offset) { - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; + unsigned long n = *offset; - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return schedstat_start(file, offset); +} + +static void schedstat_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations schedstat_sops = { + .start = schedstat_start, + .next = schedstat_next, + .stop = schedstat_stop, + .show = show_schedstat, +}; + +static int schedstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &schedstat_sops); } +static int schedstat_release(struct inode *inode, struct file *file) +{ + return 0; +}; + static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = schedstat_release, }; static int __init proc_schedstat_init(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; + struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, regs, -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, regs); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); goto skip; + } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; + if (syscall_get_nr(current, regs) < 0) + goto skip; /* Explicit request to skip. */ + return 0; case SECCOMP_RET_ALLOW: return 0; diff --git a/kernel/signal.c b/kernel/signal.c index a49c7f36ceb3..2ec870a4c3c4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -31,6 +31,7 @@ #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> +#include <linux/compat.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -679,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * No need to set need_resched since signal event passing * goes through ->blocked */ -void signal_wake_up(struct task_struct *t, int resume) +void signal_wake_up_state(struct task_struct *t, unsigned int state) { - unsigned int mask; - set_tsk_thread_flag(t, TIF_SIGPENDING); - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable + * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) + if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } @@ -843,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t) assert_spin_locked(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); - signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* @@ -1162,11 +1157,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk("%s/%d: potentially unexpected fatal signal %d.\n", + printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", current->comm, task_pid_nr(current), signr); #if defined(__i386__) && !defined(__arch_um__) - printk("code at %08lx: ", regs->ip); + printk(KERN_INFO "code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { @@ -1174,11 +1169,11 @@ static void print_fatal_signal(int signr) if (get_user(insn, (unsigned char *)(regs->ip + i))) break; - printk("%02x ", insn); + printk(KERN_CONT "%02x ", insn); } } + printk(KERN_CONT "\n"); #endif - printk("\n"); preempt_disable(); show_regs(regs); preempt_enable(); @@ -1637,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) unsigned long flags; struct sighand_struct *psig; bool autoreap = false; + cputime_t utime, stime; BUG_ON(sig == -1); @@ -1674,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) task_uid(tsk)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); + info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1739,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; + cputime_t utime, stime; if (for_ptracer) { parent = tsk->parent; @@ -1753,12 +1751,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(tsk->utime); - info.si_stime = cputime_to_clock_t(tsk->stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = cputime_to_clock_t(utime); + info.si_stime = cputime_to_clock_t(stime); info.si_code = why; switch (why) { @@ -1799,6 +1798,10 @@ static inline int may_ptrace_stop(void) * If SIGKILL was already sent before the caller unlocked * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). + * + * This is almost outdated, a task with the pending SIGKILL can't + * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported + * after SIGKILL was already dequeued. */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) @@ -1924,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done) do_notify_parent_cldstop(current, false, why); + /* tasklist protects us from ptrace_freeze_traced() */ __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; @@ -2395,6 +2399,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, tracehook_signal_handler(sig, info, ka, regs, stepping); } +void signal_setup_done(int failed, struct ksignal *ksig, int stepping) +{ + if (failed) + force_sigsegv(ksig->sig, current); + else + signal_delivered(ksig->sig, &ksig->info, &ksig->ka, + signal_pt_regs(), stepping); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take @@ -2527,11 +2540,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) */ void set_current_blocked(sigset_t *newset) { - struct task_struct *tsk = current; sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); - spin_lock_irq(&tsk->sighand->siglock); - __set_task_blocked(tsk, newset); - spin_unlock_irq(&tsk->sighand->siglock); + __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) @@ -2615,28 +2625,58 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, return 0; } -long do_sigpending(void __user *set, unsigned long sigsetsize) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, + compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { - long error = -EINVAL; - sigset_t pending; +#ifdef __BIG_ENDIAN + sigset_t old_set = current->blocked; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (nset) { + compat_sigset_t new32; + sigset_t new_set; + int error; + if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&new_set, &new32); + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + error = sigprocmask(how, &new_set, NULL); + if (error) + return error; + } + if (oset) { + compat_sigset_t old32; + sigset_to_compat(&old32, &old_set); + if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) + return -EFAULT; + } + return 0; +#else + return sys_rt_sigprocmask(how, (sigset_t __user *)nset, + (sigset_t __user *)oset, sigsetsize); +#endif +} +#endif + +static int do_sigpending(void *set, unsigned long sigsetsize) +{ if (sigsetsize > sizeof(sigset_t)) - goto out; + return -EINVAL; spin_lock_irq(¤t->sighand->siglock); - sigorsets(&pending, ¤t->pending.signal, + sigorsets(set, ¤t->pending.signal, ¤t->signal->shared_pending.signal); spin_unlock_irq(¤t->sighand->siglock); /* Outside the lock because only this thread touches it. */ - sigandsets(&pending, ¤t->blocked, &pending); - - error = -EFAULT; - if (!copy_to_user(set, &pending, sigsetsize)) - error = 0; - -out: - return error; + sigandsets(set, ¤t->blocked, set); + return 0; } /** @@ -2645,11 +2685,36 @@ out: * @set: stores pending signals * @sigsetsize: size of sigset_t type or larger */ -SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { - return do_sigpending(set, sigsetsize); + sigset_t set; + int err = do_sigpending(&set, sigsetsize); + if (!err && copy_to_user(uset, &set, sigsetsize)) + err = -EFAULT; + return err; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, + compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN + sigset_t set; + int err = do_sigpending(&set, sigsetsize); + if (!err) { + compat_sigset_t set32; + sigset_to_compat(&set32, &set); + /* we can get here only if sigsetsize <= sizeof(set) */ + if (copy_to_user(uset, &set32, sigsetsize)) + err = -EFAULT; + } + return err; +#else + return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); +#endif +} +#endif + #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) @@ -2926,6 +2991,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) +{ + /* Not even root can pretend to send signals from the kernel. + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if ((info->si_code >= 0 || info->si_code == SI_TKILL) && + (task_pid_vnr(current) != pid)) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); + return -EPERM; + } + info->si_signo = sig; + + /* POSIX.1b doesn't mention process groups. */ + return kill_proc_info(sig, info, pid); +} + /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread @@ -2936,25 +3018,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { siginfo_t info; - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) return -EFAULT; + return do_rt_sigqueueinfo(pid, sig, &info); +} - /* Not even root can pretend to send signals from the kernel. - * Nor can they impersonate a kill()/tgkill(), which adds source info. - */ - if (info.si_code >= 0 || info.si_code == SI_TKILL) { - /* We used to allow any < 0 si_code */ - WARN_ON_ONCE(info.si_code < 0); - return -EPERM; - } - info.si_signo = sig; - - /* POSIX.1b doesn't mention process groups. */ - return kill_proc_info(sig, &info, pid); +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, + compat_pid_t, pid, + int, sig, + struct compat_siginfo __user *, uinfo) +{ + siginfo_t info; + int ret = copy_siginfo_from_user32(&info, uinfo); + if (unlikely(ret)) + return ret; + return do_rt_sigqueueinfo(pid, sig, &info); } +#endif -long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -2963,7 +3046,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info->si_code >= 0 || info->si_code == SI_TKILL) { + if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && + (task_pid_vnr(current) != pid)) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info->si_code < 0); return -EPERM; @@ -2984,6 +3068,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, + compat_pid_t, tgid, + compat_pid_t, pid, + int, sig, + struct compat_siginfo __user *, uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} +#endif + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; @@ -3029,7 +3128,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } -int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; @@ -3094,6 +3193,76 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s out: return error; } +SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) +{ + return do_sigaltstack(uss, uoss, current_user_stack_pointer()); +} + +int restore_altstack(const stack_t __user *uss) +{ + int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); + /* squash all but EFAULT for now */ + return err == -EFAULT ? err : 0; +} + +int __save_altstack(stack_t __user *uss, unsigned long sp) +{ + struct task_struct *t = current; + return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(sas_ss_flags(sp), &uss->ss_flags) | + __put_user(t->sas_ss_size, &uss->ss_size); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(sigaltstack, + const compat_stack_t __user *, uss_ptr, + compat_stack_t __user *, uoss_ptr) +{ + stack_t uss, uoss; + int ret; + mm_segment_t seg; + + if (uss_ptr) { + compat_stack_t uss32; + + memset(&uss, 0, sizeof(stack_t)); + if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) + return -EFAULT; + uss.ss_sp = compat_ptr(uss32.ss_sp); + uss.ss_flags = uss32.ss_flags; + uss.ss_size = uss32.ss_size; + } + seg = get_fs(); + set_fs(KERNEL_DS); + ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), + (stack_t __force __user *) &uoss, + compat_user_stack_pointer()); + set_fs(seg); + if (ret >= 0 && uoss_ptr) { + if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || + __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || + __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || + __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + ret = -EFAULT; + } + return ret; +} + +int compat_restore_altstack(const compat_stack_t __user *uss) +{ + int err = compat_sys_sigaltstack(uss, NULL); + /* squash all but -EFAULT for now */ + return err == -EFAULT ? err : 0; +} + +int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) +{ + struct task_struct *t = current; + return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | + __put_user(sas_ss_flags(sp), &uss->ss_flags) | + __put_user(t->sas_ss_size, &uss->ss_size); +} +#endif #ifdef __ARCH_WANT_SYS_SIGPENDING @@ -3103,7 +3272,7 @@ out: */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { - return do_sigpending(set, sizeof(*set)); + return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } #endif @@ -3130,7 +3299,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; - new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); new_blocked = current->blocked; @@ -3148,7 +3316,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - __set_current_blocked(&new_blocked); + set_current_blocked(&new_blocked); } if (oset) { @@ -3160,7 +3328,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ -#ifdef __ARCH_WANT_SYS_RT_SIGACTION +#ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent @@ -3194,7 +3362,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, out: return ret; } -#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct compat_sigaction __user *, act, + struct compat_sigaction __user *, oact, + compat_size_t, sigsetsize) +{ + struct k_sigaction new_ka, old_ka; + compat_sigset_t mask; +#ifdef __ARCH_HAS_SA_RESTORER + compat_uptr_t restorer; +#endif + int ret; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + + if (act) { + compat_uptr_t handler; + ret = get_user(handler, &act->sa_handler); + new_ka.sa.sa_handler = compat_ptr(handler); +#ifdef __ARCH_HAS_SA_RESTORER + ret |= get_user(restorer, &act->sa_restorer); + new_ka.sa.sa_restorer = compat_ptr(restorer); +#endif + ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + if (ret) + return -EFAULT; + sigset_from_compat(&new_ka.sa.sa_mask, &mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + if (!ret && oact) { + sigset_to_compat(&mask, &old_ka.sa.sa_mask); + ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler); + ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +#ifdef __ARCH_HAS_SA_RESTORER + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer); +#endif + } + return ret; +} +#endif +#endif /* !CONFIG_ODD_RT_SIGACTION */ + +#ifdef CONFIG_OLD_SIGACTION +SYSCALL_DEFINE3(sigaction, int, sig, + const struct old_sigaction __user *, act, + struct old_sigaction __user *, oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; +#ifdef __ARCH_HAS_KA_RESTORER + new_ka.ka_restorer = NULL; +#endif + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + + return ret; +} +#endif +#ifdef CONFIG_COMPAT_OLD_SIGACTION +COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, + const struct compat_old_sigaction __user *, act, + struct compat_old_sigaction __user *, oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + compat_old_sigset_t mask; + compat_uptr_t handler, restorer; + + if (act) { + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(handler, &act->sa_handler) || + __get_user(restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; + +#ifdef __ARCH_HAS_KA_RESTORER + new_ka.ka_restorer = NULL; +#endif + new_ka.sa.sa_handler = compat_ptr(handler); + new_ka.sa.sa_restorer = compat_ptr(restorer); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + return ret; +} +#endif #ifdef __ARCH_WANT_SYS_SGETMASK @@ -3212,6 +3505,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; + siginitset(&newset, newmask); set_current_blocked(&newset); return old; @@ -3261,7 +3555,6 @@ int sigsuspend(sigset_t *set) return -ERESTARTNOHAND; } -#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received @@ -3280,7 +3573,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; return sigsuspend(&newset); } -#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) +{ +#ifdef __BIG_ENDIAN + sigset_t newset; + compat_sigset_t newset32; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&newset, &newset32); + return sigsuspend(&newset); +#else + /* on little-endian bitmaps don't care about granularity */ + return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); +#endif +} +#endif + +#ifdef CONFIG_OLD_SIGSUSPEND +SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) +{ + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); +} +#endif +#ifdef CONFIG_OLD_SIGSUSPEND3 +SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) +{ + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); +} +#endif __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) { diff --git a/kernel/smp.c b/kernel/smp.c index 29dd40a9f2f4..8e451f3ff51b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,23 +16,14 @@ #include "smpboot.h" #ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static struct { - struct list_head queue; - raw_spinlock_t lock; -} call_function __cacheline_aligned_in_smp = - { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), - }; - enum { CSD_FLAG_LOCK = 0x01, }; struct call_function_data { - struct call_single_data csd; - atomic_t refs; + struct call_single_data __percpu *csd; cpumask_var_t cpumask; + cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); + if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, + cpu_to_node(cpu))) + return notifier_from_errno(-ENOMEM); + cfd->csd = alloc_percpu(struct call_single_data); + if (!cfd->csd) { + free_cpumask_var(cfd->cpumask); + return notifier_from_errno(-ENOMEM); + } break; #ifdef CONFIG_HOTPLUG_CPU @@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); + free_percpu(cfd->csd); break; #endif }; @@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) } /* - * Invoked by arch to handle an IPI for call function. Must be called with - * interrupts disabled. - */ -void generic_smp_call_function_interrupt(void) -{ - struct call_function_data *data; - int cpu = smp_processor_id(); - - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - WARN_ON_ONCE(!cpu_online(cpu)); - - /* - * Ensure entry is visible on call_function_queue after we have - * entered the IPI. See comment in smp_call_function_many. - * If we don't have this, then we may miss an entry on the list - * and never get another IPI to process it. - */ - smp_mb(); - - /* - * It's ok to use list_for_each_rcu() here even though we may - * delete 'pos', since list_del_rcu() doesn't clear ->next - */ - list_for_each_entry_rcu(data, &call_function.queue, csd.list) { - int refs; - smp_call_func_t func; - - /* - * Since we walk the list without any locks, we might - * see an entry that was completed, removed from the - * list and is in the process of being reused. - * - * We must check that the cpu is in the cpumask before - * checking the refs, and both must be set before - * executing the callback on this cpu. - */ - - if (!cpumask_test_cpu(cpu, data->cpumask)) - continue; - - smp_rmb(); - - if (atomic_read(&data->refs) == 0) - continue; - - func = data->csd.func; /* save for later warn */ - func(data->csd.info); - - /* - * If the cpu mask is not still set then func enabled - * interrupts (BUG), and this cpu took another smp call - * function interrupt and executed func(info) twice - * on this cpu. That nested execution decremented refs. - */ - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pf enabled interrupts and double executed\n", func); - continue; - } - - refs = atomic_dec_return(&data->refs); - WARN_ON(refs < 0); - - if (refs) - continue; - - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - - csd_unlock(&data->csd); - } - -} - -/* * Invoked by arch to handle an IPI for call function single. Must be * called from the arch with interrupts disabled. */ @@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { struct call_function_data *data; - unsigned long flags; - int refs, cpu, next_cpu, this_cpu = smp_processor_id(); + int cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask, } data = &__get_cpu_var(cfd_data); - csd_lock(&data->csd); - - /* This BUG_ON verifies our reuse assertions and can be removed */ - BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); - /* - * The global call function queue list add and delete are protected - * by a lock, but the list is traversed without any lock, relying - * on the rcu list add and delete to allow safe concurrent traversal. - * We reuse the call function data without waiting for any grace - * period after some other cpu removes it from the global queue. - * This means a cpu might find our data block as it is being - * filled out. - * - * We hold off the interrupt handler on the other cpu by - * ordering our writes to the cpu mask vs our setting of the - * refs counter. We assert only the cpu owning the data block - * will set a bit in cpumask, and each bit will only be cleared - * by the subject cpu. Each cpu must first find its bit is - * set and then check that refs is set indicating the element is - * ready to be processed, otherwise it must skip the entry. - * - * On the previous iteration refs was set to 0 by another cpu. - * To avoid the use of transitivity, set the counter to 0 here - * so the wmb will pair with the rmb in the interrupt handler. - */ - atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ - - data->csd.func = func; - data->csd.info = info; - - /* Ensure 0 refs is visible before mask. Also orders func and info */ - smp_wmb(); - - /* We rely on the "and" being processed before the store */ cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - refs = cpumask_weight(data->cpumask); /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!refs)) { - csd_unlock(&data->csd); + if (unlikely(!cpumask_weight(data->cpumask))) return; - } - raw_spin_lock_irqsave(&call_function.lock, flags); /* - * Place entry at the _HEAD_ of the list, so that any cpu still - * observing the entry in generic_smp_call_function_interrupt() - * will not miss any other list entries: + * After we put an entry into the list, data->cpumask + * may be cleared again when another CPU sends another IPI for + * a SMP function call, so data->cpumask will be zero. */ - list_add_rcu(&data->csd.list, &call_function.queue); - /* - * We rely on the wmb() in list_add_rcu to complete our writes - * to the cpumask before this write to refs, which indicates - * data is on the list and is ready to be processed. - */ - atomic_set(&data->refs, refs); - raw_spin_unlock_irqrestore(&call_function.lock, flags); + cpumask_copy(data->cpumask_ipi, data->cpumask); - /* - * Make the list addition visible before sending the ipi. - * (IPIs must obey or appear to obey normal Linux cache - * coherency rules -- see comment in generic_exec_single). - */ - smp_mb(); + for_each_cpu(cpu, data->cpumask) { + struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); + struct call_single_queue *dst = + &per_cpu(call_single_queue, cpu); + unsigned long flags; + + csd_lock(csd); + csd->func = func; + csd->info = info; + + raw_spin_lock_irqsave(&dst->lock, flags); + list_add_tail(&csd->list, &dst->list); + raw_spin_unlock_irqrestore(&dst->lock, flags); + } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(data->cpumask); + arch_send_call_function_ipi_mask(data->cpumask_ipi); - /* Optionally wait for the CPUs to complete */ - if (wait) - csd_lock_wait(&data->csd); + if (wait) { + for_each_cpu(cpu, data->cpumask) { + struct call_single_data *csd = + per_cpu_ptr(data->csd, cpu); + csd_lock_wait(csd); + } + } } EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d6c5fc054242..b9bde5727829 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data) continue; } - BUG_ON(td->cpu != smp_processor_id()); + //BUG_ON(td->cpu != smp_processor_id()); /* Check for state change setup */ switch (td->status) { @@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } - get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; + if (ht->create) + ht->create(cpu); return 0; } @@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (tsk) + if (tsk && !ht->selfparking) kthread_park(tsk); } diff --git a/kernel/softirq.c b/kernel/softirq.c index ed567babe789..b4d252fd195b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing MAX_SOFTIRQ_RESTART times, - * and we fall back to softirqd after that. + * We restart softirq processing for at most 2 ms, + * and if need_resched() is not set. * - * This number has been established via experimentation. + * These limits have been established via experimentation. * The two things to balance is latency against fairness - * we want to handle softirqs as soon as possible, but they * should not be able to lock up the box. */ -#define MAX_SOFTIRQ_RESTART 10 +#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) asmlinkage void __do_softirq(void) { struct softirq_action *h; __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; + unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - vtime_account_irq_enter(current); + account_irq_enter_time(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -264,15 +264,16 @@ restart: local_irq_disable(); pending = local_softirq_pending(); - if (pending && --max_restart) - goto restart; + if (pending) { + if (time_before(jiffies, end) && !need_resched()) + goto restart; - if (pending) wakeup_softirqd(); + } lockdep_softirq_exit(); - vtime_account_irq_exit(current); + account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -341,7 +342,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - vtime_account_irq_exit(current); + account_irq_exit_time(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) diff --git a/kernel/srcu.c b/kernel/srcu.c index 2b859828cdc3..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { - int sum; - - sum = srcu_readers_active(sp); - WARN_ON(sum); /* Leakage unless caller handles error. */ - if (sum != 0) - return; + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } @@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp) { int idx; + idx = ACCESS_ONCE(sp->completed) & 0x1; preempt_disable(); - idx = rcu_dereference_index_check(sp->completed, - rcu_read_lock_sched_held()) & 0x1; ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; smp_mb(); /* B */ /* Avoid leaking the critical section. */ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; @@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { - preempt_disable(); smp_mb(); /* C */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; - preempt_enable(); + this_cpu_dec(sp->per_cpu_ref->c[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) !lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + might_sleep(); init_completion(&rcu.completion); head->next = NULL; @@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) * synchronize_srcu - wait for prior SRCU read-side critical-section completion * @sp: srcu_struct with which to synchronize. * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->completed & 1) ^ 1) to drain to zero at first, + * and then flip the completed and wait for the count of the other index. + * + * Can block; must be called from process context. * * Note that it is illegal to call synchronize_srcu() from the corresponding * SRCU read-side critical section; doing so will result in deadlock. @@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. It is also illegal to call - * synchronize_srcu_expedited() from the corresponding SRCU read-side - * critical section; doing so will result in deadlock. However, it is - * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct - * from some other srcu_struct's read-side critical section, as long as + * Note that it is also illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; + * doing so will result in deadlock. However, it is perfectly legal + * to call synchronize_srcu_expedited() on one srcu_struct from some + * other srcu_struct's read-side critical section, as long as * the resulting graph of srcu_structs is acyclic. */ void synchronize_srcu_expedited(struct srcu_struct *sp) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2f194e965715..95d178c62d5a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -18,7 +18,7 @@ #include <linux/stop_machine.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> - +#include <linux/smpboot.h> #include <linux/atomic.h> /* @@ -37,10 +37,10 @@ struct cpu_stopper { spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ - struct task_struct *thread; /* stopper thread */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); +static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) @@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) } /* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work) +static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + struct task_struct *p = per_cpu(cpu_stopper_task, cpu); + unsigned long flags; spin_lock_irqsave(&stopper->lock, flags); if (stopper->enabled) { list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); + wake_up_process(p); } else cpu_stop_signal_done(work->done, false); @@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); + cpu_stop_queue_work(cpu, &work); wait_for_completion(&done.completion); return done.executed ? done.ret : -ENOENT; } @@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); + cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ @@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, */ preempt_disable(); for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), - &per_cpu(stop_cpus_work, cpu)); + cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); preempt_enable(); } @@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) return ret; } -static int cpu_stopper_thread(void *data) +static int cpu_stop_should_run(unsigned int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + unsigned long flags; + int run; + + spin_lock_irqsave(&stopper->lock, flags); + run = !list_empty(&stopper->works); + spin_unlock_irqrestore(&stopper->lock, flags); + return run; +} + +static void cpu_stopper_thread(unsigned int cpu) { - struct cpu_stopper *stopper = data; + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; int ret; repeat: - set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - work = NULL; spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { @@ -273,8 +279,6 @@ repeat: struct cpu_stop_done *done = work->done; char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - __set_current_state(TASK_RUNNING); - /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); @@ -290,88 +294,55 @@ repeat: ksym_buf), arg); cpu_stop_signal_done(done, true); - } else - schedule(); - - goto repeat; + goto repeat; + } } extern void sched_set_stop_task(int cpu, struct task_struct *stop); -/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ -static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void cpu_stop_create(unsigned int cpu) +{ + sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); +} + +static void cpu_stop_park(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - BUG_ON(stopper->thread || stopper->enabled || - !list_empty(&stopper->works)); - p = kthread_create_on_node(cpu_stopper_thread, - stopper, - cpu_to_node(cpu), - "migration/%d", cpu); - if (IS_ERR(p)) - return notifier_from_errno(PTR_ERR(p)); - get_task_struct(p); - kthread_bind(p, cpu); - sched_set_stop_task(cpu, p); - stopper->thread = p; - break; - - case CPU_ONLINE: - /* strictly unnecessary, as first user will wake it */ - wake_up_process(stopper->thread); - /* mark enabled */ - spin_lock_irq(&stopper->lock); - stopper->enabled = true; - spin_unlock_irq(&stopper->lock); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_POST_DEAD: - { - struct cpu_stop_work *work; - - sched_set_stop_task(cpu, NULL); - /* kill the stopper */ - kthread_stop(stopper->thread); - /* drain remaining works */ - spin_lock_irq(&stopper->lock); - list_for_each_entry(work, &stopper->works, list) - cpu_stop_signal_done(work->done, false); - stopper->enabled = false; - spin_unlock_irq(&stopper->lock); - /* release the stopper */ - put_task_struct(stopper->thread); - stopper->thread = NULL; - break; - } -#endif - } + struct cpu_stop_work *work; + unsigned long flags; - return NOTIFY_OK; + /* drain remaining works */ + spin_lock_irqsave(&stopper->lock, flags); + list_for_each_entry(work, &stopper->works, list) + cpu_stop_signal_done(work->done, false); + stopper->enabled = false; + spin_unlock_irqrestore(&stopper->lock, flags); } -/* - * Give it a higher priority so that cpu stopper is available to other - * cpu notifiers. It currently shares the same priority as sched - * migration_notifier. - */ -static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { - .notifier_call = cpu_stop_cpu_callback, - .priority = 10, +static void cpu_stop_unpark(unsigned int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + spin_lock_irq(&stopper->lock); + stopper->enabled = true; + spin_unlock_irq(&stopper->lock); +} + +static struct smp_hotplug_thread cpu_stop_threads = { + .store = &cpu_stopper_task, + .thread_should_run = cpu_stop_should_run, + .thread_fn = cpu_stopper_thread, + .thread_comm = "migration/%u", + .create = cpu_stop_create, + .setup = cpu_stop_unpark, + .park = cpu_stop_park, + .unpark = cpu_stop_unpark, + .selfparking = true, }; static int __init cpu_stop_init(void) { - void *bcpu = (void *)(long)smp_processor_id(); unsigned int cpu; - int err; for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); @@ -380,15 +351,8 @@ static int __init cpu_stop_init(void) INIT_LIST_HEAD(&stopper->works); } - /* start one for the boot cpu */ - err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, - bcpu); - BUG_ON(err != NOTIFY_OK); - cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); - register_cpu_notifier(&cpu_stop_cpu_notifier); - + BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); stop_machine_initialized = true; - return 0; } early_initcall(cpu_stop_init); diff --git a/kernel/sys.c b/kernel/sys.c index 265b37690421..81f56445fba9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/binfmts.h> #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ @@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex); SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); char buffer[256]; int ret = 0; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) return -EPERM; /* For safety, we require "magic" arguments. */ @@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, * pid_namespace, the command is handled by reboot_pid_ns() which will * call do_exit(). */ - ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + ret = reboot_pid_ns(pid_ns, cmd); if (ret) return ret; @@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; - struct dentry *dentry; + struct inode *inode; int err; exe = fdget(fd); if (!exe.file) return -EBADF; - dentry = exe.file->f_path.dentry; + inode = file_inode(exe.file); /* * Because the original mm->exe_file points to executable file, make @@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. */ err = -EACCES; - if (!S_ISREG(dentry->d_inode->i_mode) || + if (!S_ISREG(inode->i_mode) || exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - err = inode_permission(dentry->d_inode, MAY_EXEC); + err = inode_permission(inode, MAY_EXEC); if (err) goto exit; @@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = 0; switch (option) { - case PR_SET_PDEATHSIG: - if (!valid_signal(arg2)) { - error = -EINVAL; - break; - } - me->pdeath_signal = arg2; - break; - case PR_GET_PDEATHSIG: - error = put_user(me->pdeath_signal, (int __user *)arg2); - break; - case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; break; - case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 1) { - error = -EINVAL; - break; - } - set_dumpable(me->mm, arg2); + } + me->pdeath_signal = arg2; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { + error = -EINVAL; break; + } + set_dumpable(me->mm, arg2); + break; - case PR_SET_UNALIGN: - error = SET_UNALIGN_CTL(me, arg2); - break; - case PR_GET_UNALIGN: - error = GET_UNALIGN_CTL(me, arg2); - break; - case PR_SET_FPEMU: - error = SET_FPEMU_CTL(me, arg2); - break; - case PR_GET_FPEMU: - error = GET_FPEMU_CTL(me, arg2); - break; - case PR_SET_FPEXC: - error = SET_FPEXC_CTL(me, arg2); - break; - case PR_GET_FPEXC: - error = GET_FPEXC_CTL(me, arg2); - break; - case PR_GET_TIMING: - error = PR_TIMING_STATISTICAL; - break; - case PR_SET_TIMING: - if (arg2 != PR_TIMING_STATISTICAL) - error = -EINVAL; - break; - case PR_SET_NAME: - comm[sizeof(me->comm)-1] = 0; - if (strncpy_from_user(comm, (char __user *)arg2, - sizeof(me->comm) - 1) < 0) - return -EFAULT; - set_task_comm(me, comm); - proc_comm_connector(me); - break; - case PR_GET_NAME: - get_task_comm(comm, me); - if (copy_to_user((char __user *)arg2, comm, - sizeof(comm))) - return -EFAULT; - break; - case PR_GET_ENDIAN: - error = GET_ENDIAN(me, arg2); - break; - case PR_SET_ENDIAN: - error = SET_ENDIAN(me, arg2); - break; - case PR_GET_SECCOMP: - error = prctl_get_seccomp(); - break; - case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2, (char __user *)arg3); - break; - case PR_GET_TSC: - error = GET_TSC_CTL(arg2); - break; - case PR_SET_TSC: - error = SET_TSC_CTL(arg2); - break; - case PR_TASK_PERF_EVENTS_DISABLE: - error = perf_event_task_disable(); - break; - case PR_TASK_PERF_EVENTS_ENABLE: - error = perf_event_task_enable(); - break; - case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; - break; - case PR_SET_TIMERSLACK: - if (arg2 <= 0) - current->timer_slack_ns = + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + break; + case PR_SET_NAME: + comm[sizeof(me->comm) - 1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + proc_comm_connector(me); + break; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) + return -EFAULT; + break; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2, (char __user *)arg3); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); + break; + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = current->default_timer_slack_ns; - else - current->timer_slack_ns = arg2; - break; - case PR_MCE_KILL: - if (arg4 | arg5) - return -EINVAL; - switch (arg2) { - case PR_MCE_KILL_CLEAR: - if (arg3 != 0) - return -EINVAL; - current->flags &= ~PF_MCE_PROCESS; - break; - case PR_MCE_KILL_SET: - current->flags |= PF_MCE_PROCESS; - if (arg3 == PR_MCE_KILL_EARLY) - current->flags |= PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_LATE) - current->flags &= ~PF_MCE_EARLY; - else if (arg3 == PR_MCE_KILL_DEFAULT) - current->flags &= - ~(PF_MCE_EARLY|PF_MCE_PROCESS); - else - return -EINVAL; - break; - default: + else + current->timer_slack_ns = arg2; + break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) return -EINVAL; - } + current->flags &= ~PF_MCE_PROCESS; break; - case PR_MCE_KILL_GET: - if (arg2 | arg3 | arg4 | arg5) - return -EINVAL; - if (current->flags & PF_MCE_PROCESS) - error = (current->flags & PF_MCE_EARLY) ? - PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); else - error = PR_MCE_KILL_DEFAULT; - break; - case PR_SET_MM: - error = prctl_set_mm(arg2, arg3, arg4, arg5); - break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; - case PR_SET_CHILD_SUBREAPER: - me->signal->is_child_subreaper = !!arg2; - break; - case PR_GET_CHILD_SUBREAPER: - error = put_user(me->signal->is_child_subreaper, - (int __user *) arg2); - break; - case PR_SET_NO_NEW_PRIVS: - if (arg2 != 1 || arg3 || arg4 || arg5) return -EINVAL; - - current->no_new_privs = 1; break; - case PR_GET_NO_NEW_PRIVS: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - return current->no_new_privs ? 1 : 0; default: - error = -EINVAL; - break; + return -EINVAL; + } + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; + case PR_SET_MM: + error = prctl_set_mm(arg2, arg3, arg4, arg5); + break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *)arg2); + break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) + return -EINVAL; + + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; + default: + error = -EINVAL; + break; } return error; } @@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static void argv_cleanup(struct subprocess_info *info) -{ - argv_free(info->argv); -} - static int __orderly_poweroff(void) { int argc; @@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void) } ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, argv_cleanup, NULL); - if (ret == -ENOMEM) - argv_free(argv); + NULL, NULL, NULL); + argv_free(argv); return ret; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); +cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); cond_syscall(sys_bind); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33f71f37267e..afc1dc60f3f8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@ #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> +#include <linux/sched/sysctl.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -104,7 +105,6 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; -extern int min_free_kbytes; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; @@ -157,14 +157,20 @@ extern int sysctl_tsb_ratio; #ifdef __hppa__ extern int pwrsw_enabled; +#endif + +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW extern int unaligned_enabled; #endif #ifdef CONFIG_IA64 -extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN +extern int no_unaligned_warning; +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -256,9 +262,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -301,6 +309,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -347,7 +356,45 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, @@ -362,6 +409,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", @@ -504,6 +558,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { .procname = "unaligned-trap", .data = &unaligned_enabled, @@ -870,7 +926,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_doulongvec_minmax, }, #endif -#ifdef CONFIG_IA64 +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, @@ -878,6 +934,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_IA64 { .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, @@ -1965,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write, int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) - add_taint(i); + add_taint(i, LOCKDEP_STILL_OK); } } @@ -2042,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMPABLE_SAFE && + if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4e..ebf72358e86a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_ABC, "tcp_abc" }, { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, @@ -971,7 +970,6 @@ out: static ssize_t bin_intvec(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t copied = 0; char *buffer; ssize_t result; @@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file, if (oldval && oldlen) { unsigned __user *vec = oldval; size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buffer, BUFSZ - 1); if (result < 0) goto out_kfree; @@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file, if (newval && newlen) { unsigned __user *vec = newval; size_t length = newlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; @@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file, str += snprintf(str, end - str, "%lu\t", value); } - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); + result = kernel_write(file, buffer, str - buffer, 0); if (result < 0) goto out_kfree; } @@ -1049,7 +1041,6 @@ out: static ssize_t bin_ulongvec(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t copied = 0; char *buffer; ssize_t result; @@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file, if (oldval && oldlen) { unsigned long __user *vec = oldval; size_t length = oldlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buffer, BUFSZ - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buffer, BUFSZ - 1); if (result < 0) goto out_kfree; @@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file, if (newval && newlen) { unsigned long __user *vec = newval; size_t length = newlen / sizeof(*vec); - loff_t pos = 0; char *str, *end; int i; @@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file, str += snprintf(str, end - str, "%lu\t", value); } - set_fs(KERNEL_DS); - result = vfs_write(file, buffer, str - buffer, &pos); - set_fs(old_fs); + result = kernel_write(file, buffer, str - buffer, 0); if (result < 0) goto out_kfree; } @@ -1127,19 +1112,15 @@ out: static ssize_t bin_uuid(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t result, copied = 0; /* Only supports reads */ if (oldval && oldlen) { - loff_t pos = 0; char buf[40], *str = buf; unsigned char uuid[16]; int i; - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) goto out; @@ -1175,18 +1156,14 @@ out: static ssize_t bin_dn_node_address(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { - mm_segment_t old_fs = get_fs(); ssize_t result, copied = 0; if (oldval && oldlen) { - loff_t pos = 0; char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; - set_fs(KERNEL_DS); - result = vfs_read(file, buf, sizeof(buf) - 1, &pos); - set_fs(old_fs); + result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) goto out; @@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file, /* Convert the decnet address to binary */ result = -EIO; - nodep = strchr(buf, '.') + 1; + nodep = strchr(buf, '.'); if (!nodep) goto out; + ++nodep; area = simple_strtoul(buf, NULL, 10); node = simple_strtoul(nodep, NULL, 10); @@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file, } if (newval && newlen) { - loff_t pos = 0; __le16 dnaddr; char buf[15]; int len; @@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - set_fs(KERNEL_DS); - result = vfs_write(file, buf, len, &pos); - set_fs(old_fs); + result = kernel_write(file, buf, len, 0); if (result < 0) goto out; } @@ -1344,7 +1319,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) diff --git a/kernel/time.c b/kernel/time.c index d226c6a3fd28..f8342a41efa6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, } /* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * @@ -135,6 +141,8 @@ static inline void warp_clock(void) struct timespec adjust; adjust = current_kernel_time(); + if (sys_tz.tz_minuteswest != 0) + persistent_clock_is_local = 1; adjust.tv_sec += sys_tz.tz_minuteswest * 60; do_settimeofday(&adjust); } @@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -inline unsigned int jiffies_to_msecs(const unsigned long j) +unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); -inline unsigned int jiffies_to_usecs(const unsigned long j) +unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8601f0db1261..24510d84efd7 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool +# Platforms has a persistent clock +config ALWAYS_USE_PERSISTENT_CLOCK + bool + default n + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD default y depends on GENERIC_CLOCKEVENTS +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST + bool + # Clockevents broadcasting infrastructure config GENERIC_CLOCKEVENTS_BROADCAST bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 30b6de0d977c..c6d6400ee137 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev, clockevents_config(dev, freq); clockevents_register_device(dev); } +EXPORT_SYMBOL_GPL(clockevents_config_and_register); /** * clockevents_update_freq - Update frequency and reprogram a clock event device. diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 24174b4d669b..072bb066bb7d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -15,6 +15,7 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/rtc.h> #include "tick-internal.h" @@ -22,7 +23,7 @@ * NTP timekeeping variables: */ -DEFINE_SPINLOCK(ntp_lock); +DEFINE_RAW_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ @@ -347,7 +348,7 @@ void ntp_clear(void) { unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; @@ -361,7 +362,7 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } @@ -371,9 +372,9 @@ u64 ntp_tick_length(void) unsigned long flags; s64 ret; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); ret = tick_length; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return ret; } @@ -394,7 +395,7 @@ int second_overflow(unsigned long secs) int leap = 0; unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. If in leap-insert state at the end of the @@ -478,13 +479,12 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return leap; } -#ifdef CONFIG_GENERIC_CMOS_UPDATE - +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work) } getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) - fail = update_persistent_clock(now); + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { + struct timespec adjust = now; + + fail = -ENODEV; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); +#ifdef CONFIG_GENERIC_CMOS_UPDATE + fail = update_persistent_clock(adjust); +#endif +#ifdef CONFIG_RTC_SYSTOHC + if (fail == -ENODEV) + fail = rtc_set_ntp_time(adjust); +#endif + } next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; - if (!fail) + if (!fail || fail == -ENODEV) next.tv_sec = 659; else next.tv_sec = 0; @@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - spin_lock_irq(&ntp_lock); + raw_spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - spin_unlock_irq(&ntp_lock); + raw_spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - spin_lock_irqsave(&ntp_lock, flags); + raw_spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - spin_unlock_irqrestore(&ntp_lock, flags); + raw_spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f113755695e2..2fb8cb88df8d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,6 +18,7 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> +#include <linux/smp.h> #include "tick-internal.h" @@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +static void err_broadcast(const struct cpumask *mask) +{ + pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } +} + /* * Check, if the device is disfunctional and a place holder, which * needs to be handled by the broadcast device. @@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; + tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; @@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); tick_broadcast_clear_oneshot(cpu); + } else { + tick_device_setup_broadcast_func(dev); } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *evt = td->evtdev; + + if (!evt) + return -ENODEV; + + if (!evt->event_handler) + return -EINVAL; + + evt->event_handler(evt); + return 0; +} +#endif + /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d58e552d9fd1..a19a39952c1b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/irq_work.h> #include <asm/irq_regs.h> @@ -28,7 +29,7 @@ /* * Per cpu nohz control structure */ -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* * The time, when the last jiffy update happened. Protected by jiffies_lock. @@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, time_delta = timekeeping_max_deferment(); } while (read_seqretry(&jiffies_lock, seq)); - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || + arch_needs_cpu(cpu) || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -553,6 +554,7 @@ void tick_nohz_idle_enter(void) local_irq_enable(); } +EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); /** * tick_nohz_irq_exit - update next tick event from interrupt exit @@ -631,8 +633,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; + + if (vtime_accounting_enabled()) + return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -681,6 +686,7 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } +EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbc6acb0db3f..9a0bc98fbe1d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -29,6 +29,9 @@ static struct timekeeper timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +/* Flag for if there is a persistent clock on this platform */ +bool __read_mostly persistent_clock_exist = false; + static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { @@ -135,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) } /* Timekeeper helper functions. */ + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 (*arch_gettimeoffset)(void); + +u32 get_arch_timeoffset(void) +{ + if (likely(arch_gettimeoffset)) + return arch_gettimeoffset(); + return 0; +} +#else +static inline u32 get_arch_timeoffset(void) { return 0; } +#endif + static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; @@ -151,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) nsec = cycle_delta * tk->mult + tk->xtime_nsec; nsec >>= tk->shift; - /* If arch requires, add in gettimeoffset() */ - return nsec + arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) @@ -171,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - /* If arch requires, add in gettimeoffset() */ - return nsec + arch_gettimeoffset(); + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + get_arch_timeoffset(); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); @@ -254,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; - /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; + /* If arch requires, add in get_arch_timeoffset() */ + tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; tk_normalize_xtime(tk); @@ -264,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** - * getnstimeofday - Returns the time of day in a timespec + * __getnstimeofday - Returns the time of day in a timespec. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Updates the time of day in the timespec. + * Returns 0 on success, or -ve when suspended (timespec will be undefined). */ -void getnstimeofday(struct timespec *ts) +int __getnstimeofday(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs = 0; - WARN_ON(timekeeping_suspended); - do { seq = read_seqbegin(&tk->lock); @@ -287,6 +303,26 @@ void getnstimeofday(struct timespec *ts) ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); + + /* + * Do not bail out early, in case there were callers still using + * the value, even in the face of the WARN_ON. + */ + if (unlikely(timekeeping_suspended)) + return -EAGAIN; + return 0; +} +EXPORT_SYMBOL(__getnstimeofday); + +/** + * getnstimeofday - Returns the time of day in a timespec. + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec (WARN if suspended). + */ +void getnstimeofday(struct timespec *ts) +{ + WARN_ON(__getnstimeofday(ts)); } EXPORT_SYMBOL(getnstimeofday); @@ -640,12 +676,14 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; now.tv_nsec = 0; - } + } else if (now.tv_sec || now.tv_nsec) + persistent_clock_exist = true; read_boot_clock(&boot); if (!timespec_valid_strict(&boot)) { @@ -718,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta) { struct timekeeper *tk = &timekeeper; unsigned long flags; - struct timespec ts; - /* Make sure we don't set the clock twice */ - read_persistent_clock(&ts); - if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) + /* + * Make sure we don't set the clock twice, as timekeeping_resume() + * already did it + */ + if (has_persistent_clock()) return; write_seqlock_irqsave(&tk->lock, flags); diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc new file mode 100644 index 000000000000..511bdf2cafda --- /dev/null +++ b/kernel/timeconst.bc @@ -0,0 +1,108 @@ +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. */ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +define timeconst(hz) { + print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include <linux/param.h>\n" + print "#include <linux/types.h>\n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +timeconst(hz) diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl deleted file mode 100644 index eb51d76e058a..000000000000 --- a/kernel/timeconst.pl +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/perl -# ----------------------------------------------------------------------- -# -# Copyright 2007-2008 rPath, Inc. - All Rights Reserved -# -# This file is part of the Linux kernel, and is made available under -# the terms of the GNU General Public License version 2 or (at your -# option) any later version; incorporated herein by reference. -# -# ----------------------------------------------------------------------- -# - -# -# Usage: timeconst.pl HZ > timeconst.h -# - -# Precomputed values for systems without Math::BigInt -# Generated by: -# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 -%canned_values = ( - 24 => [ - '0xa6aaaaab','0x2aaaaaa',26, - 125,3, - '0xc49ba5e4','0x1fbe76c8b4',37, - 3,125, - '0xa2c2aaab','0xaaaa',16, - 125000,3, - '0xc9539b89','0x7fffbce4217d',47, - 3,125000, - ], 32 => [ - '0xfa000000','0x6000000',27, - 125,4, - '0x83126e98','0xfdf3b645a',36, - 4,125, - '0xf4240000','0x0',17, - 31250,1, - '0x8637bd06','0x3fff79c842fa',46, - 1,31250, - ], 48 => [ - '0xa6aaaaab','0x6aaaaaa',27, - 125,6, - '0xc49ba5e4','0xfdf3b645a',36, - 6,125, - '0xa2c2aaab','0x15555',17, - 62500,3, - '0xc9539b89','0x3fffbce4217d',46, - 3,62500, - ], 64 => [ - '0xfa000000','0xe000000',28, - 125,8, - '0x83126e98','0x7ef9db22d',35, - 8,125, - '0xf4240000','0x0',18, - 15625,1, - '0x8637bd06','0x1fff79c842fa',45, - 1,15625, - ], 100 => [ - '0xa0000000','0x0',28, - 10,1, - '0xcccccccd','0x733333333',35, - 1,10, - '0x9c400000','0x0',18, - 10000,1, - '0xd1b71759','0x1fff2e48e8a7',45, - 1,10000, - ], 122 => [ - '0x8325c53f','0xfbcda3a',28, - 500,61, - '0xf9db22d1','0x7fbe76c8b',35, - 61,500, - '0x8012e2a0','0x3ef36',18, - 500000,61, - '0xffda4053','0x1ffffbce4217',45, - 61,500000, - ], 128 => [ - '0xfa000000','0x1e000000',29, - 125,16, - '0x83126e98','0x3f7ced916',34, - 16,125, - '0xf4240000','0x40000',19, - 15625,2, - '0x8637bd06','0xfffbce4217d',44, - 2,15625, - ], 200 => [ - '0xa0000000','0x0',29, - 5,1, - '0xcccccccd','0x333333333',34, - 1,5, - '0x9c400000','0x0',19, - 5000,1, - '0xd1b71759','0xfff2e48e8a7',44, - 1,5000, - ], 250 => [ - '0x80000000','0x0',29, - 4,1, - '0x80000000','0x180000000',33, - 1,4, - '0xfa000000','0x0',20, - 4000,1, - '0x83126e98','0x7ff7ced9168',43, - 1,4000, - ], 256 => [ - '0xfa000000','0x3e000000',30, - 125,32, - '0x83126e98','0x1fbe76c8b',33, - 32,125, - '0xf4240000','0xc0000',20, - 15625,4, - '0x8637bd06','0x7ffde7210be',43, - 4,15625, - ], 300 => [ - '0xd5555556','0x2aaaaaaa',30, - 10,3, - '0x9999999a','0x1cccccccc',33, - 3,10, - '0xd0555556','0xaaaaa',20, - 10000,3, - '0x9d495183','0x7ffcb923a29',43, - 3,10000, - ], 512 => [ - '0xfa000000','0x7e000000',31, - 125,64, - '0x83126e98','0xfdf3b645',32, - 64,125, - '0xf4240000','0x1c0000',21, - 15625,8, - '0x8637bd06','0x3ffef39085f',42, - 8,15625, - ], 1000 => [ - '0x80000000','0x0',31, - 1,1, - '0x80000000','0x0',31, - 1,1, - '0xfa000000','0x0',22, - 1000,1, - '0x83126e98','0x1ff7ced9168',41, - 1,1000, - ], 1024 => [ - '0xfa000000','0xfe000000',32, - 125,128, - '0x83126e98','0x7ef9db22',31, - 128,125, - '0xf4240000','0x3c0000',22, - 15625,16, - '0x8637bd06','0x1fff79c842f',41, - 16,15625, - ], 1200 => [ - '0xd5555556','0xd5555555',32, - 5,6, - '0x9999999a','0x66666666',31, - 6,5, - '0xd0555556','0x2aaaaa',22, - 2500,3, - '0x9d495183','0x1ffcb923a29',41, - 3,2500, - ] -); - -$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; - -sub bint($) -{ - my($x) = @_; - return Math::BigInt->new($x); -} - -# -# Constants for division by reciprocal multiplication. -# (bits, numerator, denominator) -# -sub fmul($$$) -{ - my ($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - return scalar (($n << $b)+$d-bint(1))/$d; -} - -sub fadj($$$) -{ - my($b,$n,$d) = @_; - - $n = bint($n); - $d = bint($d); - - $d = $d/bgcd($n, $d); - return scalar (($d-bint(1)) << $b)/$d; -} - -sub fmuls($$$) { - my($b,$n,$d) = @_; - my($s,$m); - my($thres) = bint(1) << ($b-1); - - $n = bint($n); - $d = bint($d); - - for ($s = 0; 1; $s++) { - $m = fmul($s,$n,$d); - return $s if ($m >= $thres); - } - return 0; -} - -# Generate a hex value if the result fits in 64 bits; -# otherwise skip. -sub bignum_hex($) { - my($x) = @_; - my $s = $x->as_hex(); - - return (length($s) > 18) ? undef : $s; -} - -# Provides mul, adj, and shr factors for a specific -# (bit, time, hz) combination -sub muladj($$$) { - my($b, $t, $hz) = @_; - my $s = fmuls($b, $t, $hz); - my $m = fmul($s, $t, $hz); - my $a = fadj($s, $t, $hz); - return (bignum_hex($m), bignum_hex($a), $s); -} - -# Provides numerator, denominator values -sub numden($$) { - my($n, $d) = @_; - my $g = bgcd($n, $d); - return ($n/$g, $d/$g); -} - -# All values for a specific (time, hz) combo -sub conversions($$) { - my ($t, $hz) = @_; - my @val = (); - - # HZ_TO_xx - push(@val, muladj(32, $t, $hz)); - push(@val, numden($t, $hz)); - - # xx_TO_HZ - push(@val, muladj(32, $hz, $t)); - push(@val, numden($hz, $t)); - - return @val; -} - -sub compute_values($) { - my($hz) = @_; - my @val = (); - my $s, $m, $a, $g; - - if (!$has_bigint) { - die "$0: HZ == $hz not canned and ". - "Math::BigInt not available\n"; - } - - # MSEC conversions - push(@val, conversions(1000, $hz)); - - # USEC conversions - push(@val, conversions(1000000, $hz)); - - return @val; -} - -sub outputval($$) -{ - my($name, $val) = @_; - my $csuf; - - if (defined($val)) { - if ($name !~ /SHR/) { - $val = "U64_C($val)"; - } - printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; - } -} - -sub output($@) -{ - my($hz, @val) = @_; - my $pfx, $bit, $suf, $s, $m, $a; - - print "/* Automatically generated by kernel/timeconst.pl */\n"; - print "/* Conversion constants for HZ == $hz */\n"; - print "\n"; - print "#ifndef KERNEL_TIMECONST_H\n"; - print "#define KERNEL_TIMECONST_H\n"; - print "\n"; - - print "#include <linux/param.h>\n"; - print "#include <linux/types.h>\n"; - - print "\n"; - print "#if HZ != $hz\n"; - print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; - print "#endif\n"; - print "\n"; - - foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', - 'HZ_TO_USEC','USEC_TO_HZ') { - foreach $bit (32) { - foreach $suf ('MUL', 'ADJ', 'SHR') { - outputval("${pfx}_$suf$bit", shift(@val)); - } - } - foreach $suf ('NUM', 'DEN') { - outputval("${pfx}_$suf", shift(@val)); - } - } - - print "\n"; - print "#endif /* KERNEL_TIMECONST_H */\n"; -} - -# Pretty-print Perl values -sub perlvals(@) { - my $v; - my @l = (); - - foreach $v (@_) { - if (!defined($v)) { - push(@l, 'undef'); - } elsif ($v =~ /^0x/) { - push(@l, "\'".$v."\'"); - } else { - push(@l, $v.''); - } - } - return join(',', @l); -} - -($hz) = @ARGV; - -# Use this to generate the %canned_values structure -if ($hz eq '--can') { - shift(@ARGV); - @hzlist = sort {$a <=> $b} (@ARGV); - - print "# Precomputed values for systems without Math::BigInt\n"; - print "# Generated by:\n"; - print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; - print "\%canned_values = (\n"; - my $pf = "\t"; - foreach $hz (@hzlist) { - my @values = compute_values($hz); - print "$pf$hz => [\n"; - while (scalar(@values)) { - my $bit; - foreach $bit (32) { - my $m = shift(@values); - my $a = shift(@values); - my $s = shift(@values); - print "\t\t", perlvals($m,$a,$s), ",\n"; - } - my $n = shift(@values); - my $d = shift(@values); - print "\t\t", perlvals($n,$d), ",\n"; - } - print "\t]"; - $pf = ', '; - } - print "\n);\n"; -} else { - $hz += 0; # Force to number - if ($hz < 1) { - die "Usage: $0 HZ\n"; - } - - @val = @{$canned_values{$hz}}; - if (!defined(@val)) { - @val = compute_values($hz); - } - output($hz, @val); -} -exit 0; diff --git a/kernel/timer.c b/kernel/timer.c index 367d00858482..dbf7a78a1ef1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include <linux/kallsyms.h> #include <linux/irq_work.h> #include <linux/sched.h> +#include <linux/sched/sysctl.h> #include <linux/slab.h> #include <asm/uaccess.h> @@ -1351,7 +1352,6 @@ void update_process_times(int user_tick) account_process_tick(p, user_tick); run_local_timers(); rcu_check_callbacks(cpu, user_tick); - printk_tick(); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_run(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485f..192473b22799 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE help See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -78,21 +81,6 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool -config EVENT_POWER_TRACING_DEPRECATED - depends on EVENT_TRACING - bool "Deprecated power event trace API, to be removed" - default y - help - Provides old power event types: - C-state/idle accounting events: - power:power_start - power:power_end - and old cpufreq accounting event: - power:power_frequency - This is for userspace compatibility - and will vanish after 5 kernel iterations, - namely 3.1. - config CONTEXT_SWITCH_TRACER bool @@ -250,6 +238,16 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/debug/tracing/snapshot + cat snapshot + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -434,6 +432,11 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741c..9e5b8c272eec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) return; local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); @@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { + struct blk_trace *bt = q->blk_trace; + + /* if control ever passes through here, it's a request based driver */ + if (unlikely(bt && !bt->rq_based)) + bt->rq_based = true; + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) +static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) { + struct request_queue *q; + struct blk_trace *bt; + + if (!bio->bi_bdev) + return; + + q = bdev_get_queue(bio->bi_bdev); + bt = q->blk_trace; + + /* + * Request based drivers will generate both rq and bio completions. + * Ignore bio ones. + */ + if (likely(!bt) || bt->rq_based) + return; + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, + struct request *rq, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); @@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, + struct request *rq, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index afd092de45b7..ab25b88aae56 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void) return cnt; } -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + int bit; + + bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); + if (bit < 0) return; - trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); /*see above*/ - }; - trace_recursion_clear(TRACE_GLOBAL_BIT); + } while_for_each_ftrace_op(op); + + trace_clear_recursion(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -221,10 +233,24 @@ static void update_global_ops(void) * registered callers. */ if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) + ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; - else + /* + * As we are calling the function directly. + * If it does not have recursion protection, + * the function_trace_op needs to be updated + * accordingly. + */ + if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + else + global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; + } else { func = ftrace_global_list_func; + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + } + /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { @@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; -#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. @@ -736,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; - struct hlist_node *n; unsigned long key; key = hash_long(ip, ftrace_profile_bits); @@ -745,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, n, hhd, node) { + hlist_for_each_entry_rcu(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -1107,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - struct hlist_node *n; if (ftrace_hash_empty(hash)) return NULL; @@ -1119,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + hlist_for_each_entry_rcu(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1176,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash, static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; - struct hlist_node *tp, *tn; + struct hlist_node *tn; struct ftrace_func_entry *entry; int size = 1 << hash->size_bits; int i; @@ -1186,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) for (i = 0; i < size; i++) { hhd = &hash->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + hlist_for_each_entry_safe(entry, tn, hhd, hlist) free_hash_entry(hash, entry); } FTRACE_WARN_ON(hash->count); @@ -1249,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; - struct hlist_node *tp; int size; int ret; int i; @@ -1264,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { ret = add_hash_entry(new_hash, entry->ip); if (ret < 0) goto free_hash; @@ -1290,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; - struct hlist_node *tp, *tn; + struct hlist_node *tn; struct hlist_head *hhd; struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; @@ -1336,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, size = 1 << src->size_bits; for (i = 0; i < size; i++) { hhd = &src->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { + hlist_for_each_entry_safe(entry, tn, hhd, hlist) { if (bits > 0) key = hash_long(entry->ip, bits); else @@ -2675,12 +2698,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 1; @@ -2875,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_func_probe *entry; struct hlist_head *hhd; - struct hlist_node *n; unsigned long key; key = hash_long(ip, FTRACE_HASH_BITS); @@ -2891,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, n, hhd, node) { + hlist_for_each_entry_rcu(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } @@ -3042,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { struct ftrace_func_probe *entry; - struct hlist_node *n, *tmp; + struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; int i, len = 0; @@ -3065,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; - hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + hlist_for_each_entry_safe(entry, tmp, hhd, node) { /* break up if statements for readability */ if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) @@ -3970,35 +3992,49 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - switch (val) { - case MODULE_STATE_COMING: + if (val == MODULE_STATE_COMING) ftrace_init_module(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); - break; - case MODULE_STATE_GOING: + return 0; +} + +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_GOING) ftrace_release_mod(mod); - break; - } return 0; } #else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_enter(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = 0, +struct notifier_block ftrace_module_enter_nb = { + .notifier_call = ftrace_module_notify_enter, + .priority = INT_MAX, /* Run before anything that can use kprobes */ +}; + +struct notifier_block ftrace_module_exit_nb = { + .notifier_call = ftrace_module_notify_exit, + .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; extern unsigned long __start_mcount_loc[]; @@ -4032,9 +4068,13 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); + ret = register_module_notifier(&ftrace_module_enter_nb); if (ret) - pr_warning("Failed to register trace ftrace module notifier\n"); + pr_warning("Failed to register trace ftrace module enter notifier\n"); + + ret = register_module_notifier(&ftrace_module_exit_nb); + if (ret) + pr_warning("Failed to register trace ftrace module exit notifier\n"); set_ftrace_early_filters(); @@ -4090,14 +4130,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); - op = rcu_dereference_raw(ftrace_control_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_control_list) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + int bit; if (function_trace_stop) return; - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) return; - trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); + trace_clear_recursion(bit); } /* @@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * Archs are to support both the regs and ftrace_ops at the same time. * If they support ftrace_ops, it is assumed they support regs. * If call backs want to use regs, they must either check for regs - * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. - * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORT_FTARCE_OPS. */ diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b223..1c71382b283d 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,8 +13,5 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -#ifdef EVENT_POWER_TRACING_DEPRECATED -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedcd..6989df2ba194 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,8 +3,10 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/ftrace_event.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> +#include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/uaccess.h> @@ -21,7 +23,6 @@ #include <linux/fs.h> #include <asm/local.h> -#include "trace.h" static void update_pages_handler(struct work_struct *work); @@ -177,7 +178,7 @@ void tracing_off_permanent(void) #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS # define RB_FORCE_8BYTE_ALIGNMENT 0 # define RB_ARCH_ALIGNMENT RB_ALIGNMENT #else @@ -185,6 +186,8 @@ void tracing_off_permanent(void) # define RB_ARCH_ALIGNMENT 8U #endif +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -333,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ - unsigned char data[]; /* data of buffer page */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; /* @@ -2432,41 +2435,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); - - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); - - WARN_ON_ONCE(1); -} + unsigned int val = this_cpu_read(current_context); + int bit; -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; + if (unlikely(val & (1 << bit))) + return 1; - trace_recursive_fail(); + val |= (1 << bit); + this_cpu_write(current_context, val); - return -1; + return 0; } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!trace_recursion_buffer()); + unsigned int val = this_cpu_read(current_context); - trace_recursion_dec(); + val--; + val &= this_cpu_read(current_context); + this_cpu_write(current_context, val); } #else @@ -3067,6 +3105,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); /** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + +/** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * @@ -3425,7 +3481,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* check for end of page padding */ if ((iter->head >= rb_page_size(iter->head_page)) && (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); + rb_inc_iter(iter); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61e081b4ba11..c2e2c2310374 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> +#include <linux/sched/rt.h> #include "trace.h" #include "trace_output.h" @@ -249,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; static struct tracer *trace_types __read_mostly; /* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; +static struct tracer *current_trace __read_mostly = &nop_trace; /* * trace_types_lock is used to protect the trace_types list. @@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; } + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) return; - } arch_spin_lock(&ftrace_max_lock); @@ -862,10 +864,13 @@ int register_tracer(struct tracer *type) current_trace = type; - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + type->allocated_snapshot = true; + } /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -881,10 +886,14 @@ int register_tracer(struct tracer *type) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); - /* Shrink the max buffer again */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + type->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + } printk(KERN_CONT "PASSED\n"); } @@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ @@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr) struct ring_buffer *buffer = tr->buffer; int cpu; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ @@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = ++__get_cpu_var(ftrace_stack_reserve); + use_stack = __this_cpu_inc_return(ftrace_stack_reserve); /* * We don't need any atomic variables, just a barrier. * If an interrupt comes in, we don't care, because it would @@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, out: /* Again, don't let gcc optimize things here */ barrier(); - __get_cpu_var(ftrace_stack_reserve)--; + __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } @@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer; static char *get_trace_buf(void) { struct trace_buffer_struct *percpu_buffer; - struct trace_buffer_struct *buffer; /* * If we have allocated per cpu buffers, then we do not @@ -1535,9 +1548,7 @@ static char *get_trace_buf(void) if (!percpu_buffer) return NULL; - buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); - - return buffer->buffer; + return this_cpu_ptr(&percpu_buffer->buffer[0]); } static int alloc_percpu_trace_buffer(void) @@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; - static struct tracer *old_tracer; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; - /* copy the tracer to avoid using a global lock all around */ + /* + * copy the tracer to avoid using a global lock all around. + * iter->trace is a copy of current_trace, the pointer to the + * name may be used instead of a strcmp(), as iter->trace->name + * will point to the same string as current_trace->name. + */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); - atomic_inc(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); + + if (!iter->snapshot) + atomic_inc(&trace_record_cmdline_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return; + + if (!iter->snapshot) + atomic_dec(&trace_record_cmdline_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) unsigned long total; const char *name = "preemption"; - if (type) - name = type->name; + name = type->name; get_total_entries(tr, &total, &entries); @@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { long cpu_file = (long) inode->i_private; struct trace_iterator *iter; @@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter->trace) goto fail; - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace && current_trace->print_max) + if (current_trace->print_max || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; + iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); iter->cpu_file = cpu_file; @@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping */ - tracing_stop(); + /* stop the trace while dumping if we are not opening "snapshot" */ + if (!iter->snapshot) + tracing_stop(); if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { @@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - /* reenable tracing if it was previously enabled */ - tracing_start(); + if (!iter->snapshot) + /* reenable tracing if it was previously enabled */ + tracing_start(); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file) } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) @@ -2899,6 +2921,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = 0; + trace_set_options(buf); *ppos += cnt; @@ -3012,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int r; mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); + r = sprintf(buf, "%s\n", current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3034,6 +3055,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) tr->data[cpu]->entries = val; } +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_array *tr, + struct trace_array *size_tr, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu]->entries, cpu); + if (ret < 0) + break; + tr->data[cpu]->entries = size_tr->data[cpu]->entries; + } + } else { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu_id]->entries, cpu_id); + if (ret == 0) + tr->data[cpu_id]->entries = + size_tr->data[cpu_id]->entries; + } + + return ret; +} + static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ -3058,23 +3104,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r = 0; - - if (cpu == RING_BUFFER_ALL_CPUS) { - int i; - for_each_tracing_cpu(i) { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[i]->entries, - i); - if (r < 0) - break; - } - } else { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[cpu]->entries, - cpu); - } - + int r = resize_buffer_duplicate_size(&global_trace, + &global_trace, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3171,6 +3202,7 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; + bool had_max_tr; int ret = 0; mutex_lock(&trace_types_lock); @@ -3195,9 +3227,21 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); - if (current_trace && current_trace->reset) + if (current_trace->reset) current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { + + had_max_tr = current_trace->allocated_snapshot; + current_trace = &nop_trace; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_sched(); /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and @@ -3205,24 +3249,19 @@ static int tracing_set_tracer(const char *buf) */ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; } destroy_trace_option_files(topts); - current_trace = &nop_trace; - topts = create_trace_option_files(t); - if (t->use_max_tr) { - int cpu; + if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(max_tr.buffer, - global_trace.data[cpu]->entries, - cpu); - if (ret < 0) - goto out; - max_tr.data[cpu]->entries = - global_trace.data[cpu]->entries; - } + ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; + t->allocated_snapshot = true; } if (t->init) { @@ -3330,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3448,7 +3486,7 @@ static int tracing_wait_pipe(struct file *filp) return -EINTR; /* - * We block until we read something and tracing is enabled. + * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, @@ -3456,7 +3494,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (tracing_is_enabled() && iter->pos) + if (!tracing_is_enabled() && iter->pos) break; } @@ -3471,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - static struct tracer *old_tracer; ssize_t sret; /* return any leftover data */ @@ -3483,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); /* @@ -3642,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - static struct tracer *old_tracer; ssize_t ret; size_t rem; unsigned int i; @@ -3652,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); @@ -4031,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * Reset the buffer so that it doesn't have incomparable timestamps. */ tracing_reset_online_cpus(&global_trace); - if (max_tr.buffer) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&max_tr); mutex_unlock(&trace_types_lock); @@ -4048,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return single_open(file, tracing_clock_show, NULL); } +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret = 0; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } + return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + + if (current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + switch (val) { + case 0: + if (current_trace->allocated_snapshot) { + /* free spare buffer */ + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; + } + break; + case 1: + if (!current_trace->allocated_snapshot) { + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&max_tr, + &global_trace, RING_BUFFER_ALL_CPUS); + if (ret < 0) + break; + current_trace->allocated_snapshot = true; + } + + local_irq_disable(); + /* Now, we're going to swap */ + update_max_tr(&global_trace, current, smp_processor_id()); + local_irq_enable(); + break; + default: + if (current_trace->allocated_snapshot) + tracing_reset_online_cpus(&max_tr); + else + ret = -EINVAL; + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: + mutex_unlock(&trace_types_lock); + return ret; +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -4104,6 +4216,16 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_seek, + .release = tracing_release, +}; +#endif /* CONFIG_TRACER_SNAPSHOT */ + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -4271,13 +4393,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -ENOMEM; if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); ret = -EINVAL; goto out; } if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); if (len < PAGE_SIZE) { ret = -EINVAL; goto out; @@ -4410,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); + cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); @@ -4486,7 +4609,7 @@ struct dentry *tracing_init_dentry(void) static struct dentry *d_percpu; -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(void) { static int once; struct dentry *d_tracer; @@ -4813,10 +4936,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - if (val) + mutex_lock(&trace_types_lock); + if (val) { ring_buffer_record_on(buffer); - else + if (current_trace->start) + current_trace->start(tr); + } else { ring_buffer_record_off(buffer); + if (current_trace->stop) + current_trace->stop(tr); + } + mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -4895,6 +5025,11 @@ static __init int tracer_init_debugfs(void) &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); +#endif + create_trace_options_dir(); for_each_tracing_cpu(cpu) @@ -5003,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { @@ -5014,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - /* Simulate the iterator */ - iter.tr = &global_trace; - iter.trace = current_trace; - switch (oops_dump_mode) { case DUMP_ALL: iter.cpu_file = TRACE_PIPE_ALL_CPU; @@ -5162,7 +5294,7 @@ __init static int tracer_alloc_buffers(void) init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); - current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902c..57d7e5397d56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -287,20 +287,62 @@ struct tracer { struct tracer_flags *flags; bool print_max; bool use_max_tr; + bool allocated_snapshot; }; /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) - -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) -#define TRACE_CONTROL_BIT (1<<13) +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* GLOBAL_BITs must be greater than FTRACE_BITs */ + TRACE_GLOBAL_BIT, + TRACE_GLOBAL_NMI_BIT, + TRACE_GLOBAL_IRQ_BIT, + TRACE_GLOBAL_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, /* * Abuse of the trace_recursion. @@ -309,11 +351,77 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<13) + TRACE_IRQ_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT +#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} #define TRACE_PIPE_ALL_CPU -1 diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cbb..aa8f5f48dae6 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@ #include <linux/ktime.h> #include <linux/trace_clock.h> -#include "trace.h" - /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void) return clock; } +EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, @@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void) local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = cpu_clock(this_cpu); + now = sched_clock_cpu(this_cpu); /* * If in an NMI context then dont risk lockups and return the * cpu_clock() time: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b946..57e9b284250c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, padding); return ret; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab7..601152523326 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - /* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, @@ -85,34 +57,34 @@ static struct tracer_flags func_flags; static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) - { struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; + int bit; int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + pc = preempt_count(); + preempt_disable_notrace(); - if (likely(disabled == 1)) { - pc = preempt_count(); + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; + + cpu = smp_processor_id(); + data = tr->data[cpu]; + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } + trace_clear_recursion(bit); - atomic_dec(&data->disabled); - local_irq_restore(flags); + out: + preempt_enable_notrace(); } static void @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - if (func_flags.val & TRACE_FUNC_OPT_STACK) register_ftrace_function(&trace_stack_ops); else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7e..39ada66389cc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -47,6 +47,8 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +static unsigned int max_depth; + static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, @@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; + /* + * The trace should run after decrementing the ret counter + * in case an interrupt were to come in. We don't want to + * lose the interrupt if max_depth is set. + */ + ftrace_graph_return(&trace); + if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); @@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || + (max_depth && trace->depth >= max_depth)) return 0; local_irq_save(flags); @@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = { #endif }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_debugfs); + static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d79602dc7..697e88d13907 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -739,12 +739,11 @@ static int task_state_char(unsigned long state) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - struct hlist_node *n; unsigned key; key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, &event_hash[key], node) { if (event->type == type) return event; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 933708677814..5c7e09d10d74 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 -#define TP_FLAG_UPROBE 8 /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9fe45fcefca0..75aa97fbe1a1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,8 +15,8 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/sched/rt.h> #include <trace/events/sched.h> - #include "trace.h" static struct trace_array *wakeup_trace; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a815..51c819c12c29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip, * The ftrace infrastructure should provide the recursion * protection. If not, this will crash the kernel! */ - trace_selftest_recursion_cnt++; + if (trace_selftest_recursion_cnt++ > 10) + return; DYN_FTRACE_TEST_NAME(); } @@ -452,7 +453,6 @@ trace_selftest_function_recursion(void) char *func_name; int len; int ret; - int cnt; /* The previous test PASSED */ pr_cont("PASSED\n"); @@ -510,19 +510,10 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_recsafe_probe); - /* - * If arch supports all ftrace features, and no other task - * was on the list, we should be fine. - */ - if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) - cnt = 2; /* Should have recursed */ - else - cnt = 1; - ret = -1; - if (trace_selftest_recursion_cnt != cnt) { - pr_cont("*callback not called expected %d times (%d)* ", - cnt, trace_selftest_recursion_cnt); + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); goto out; } @@ -568,7 +559,7 @@ trace_selftest_function_regs(void) int ret; int supported = 0; -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS supported = 1; #endif diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0c1b165778e5..42ca822fc701 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -33,7 +33,6 @@ static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); static DEFINE_MUTEX(stack_sysctl_mutex); @@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, { int cpu; - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - preempt_disable_notrace(); cpu = raw_smp_processor_id(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c2..7a809e321058 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ @@ -47,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name } #endif +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + * *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + if (unlikely(arch_trace_is_compat_syscall(regs))) + return -1; + + return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -77,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -enum print_line_t +static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -130,7 +163,7 @@ end: return TRACE_TYPE_HANDLED; } -enum print_line_t +static enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -270,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; - int size; int syscall_nr; + int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_enter_syscalls)) @@ -305,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -313,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct ring_buffer *buffer; int syscall_nr; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_exit_syscalls)) @@ -337,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; @@ -356,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; @@ -371,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; @@ -390,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; @@ -459,7 +492,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; @@ -502,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) @@ -578,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9614db8b0f8c..8dad2a92dee9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -22,25 +22,27 @@ #include <linux/uaccess.h> #include <linux/uprobes.h> #include <linux/namei.h> +#include <linux/string.h> #include "trace_probe.h" #define UPROBE_EVENT_SYSTEM "uprobes" +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* * uprobe event core functions */ -struct trace_uprobe; -struct uprobe_trace_consumer { - struct uprobe_consumer cons; - struct trace_uprobe *tu; -}; - struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; - struct uprobe_trace_consumer *consumer; + struct trace_uprobe_filter filter; + struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; @@ -63,6 +65,18 @@ static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -91,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) goto error; INIT_LIST_HEAD(&tu->list); + tu->consumer.handler = uprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); return tu; error: @@ -252,27 +268,32 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; + inode = igrab(path.dentry->d_inode); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } + ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; - inode = igrab(path.dentry->d_inode); - argc -= 2; argv += 2; /* setup a probe */ if (!event) { - char *tail = strrchr(filename, '/'); + char *tail; char *ptr; - ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); - if (!ptr) { + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { ret = -ENOMEM; goto fail_address_parse; } - tail = ptr; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; @@ -356,7 +377,7 @@ fail_address_parse: if (inode) iput(inode); - pr_info("Failed to parse address.\n"); + pr_info("Failed to parse address or file.\n"); return ret; } @@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { }; /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tu->call; - tu->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; } /* Event entry printers */ @@ -533,42 +554,43 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) { - struct uprobe_trace_consumer *utc; - int ret = 0; + return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} - if (!tu->inode || tu->consumer) - return -EINTR; +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); - utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); - if (!utc) +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) +{ + int ret = 0; + + if (is_trace_uprobe_enabled(tu)) return -EINTR; - utc->cons.handler = uprobe_dispatcher; - utc->cons.filter = NULL; - ret = uprobe_register(tu->inode, tu->offset, &utc->cons); - if (ret) { - kfree(utc); - return ret; - } + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); tu->flags |= flag; - utc->tu = tu; - tu->consumer = utc; + tu->consumer.filter = filter; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) + tu->flags &= ~flag; - return 0; + return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->inode || !tu->consumer) + if (!is_trace_uprobe_enabled(tu)) return; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; - kfree(tu->consumer); - tu->consumer = NULL; } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + bool done; + + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. + */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); + } else { + done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; + } + write_unlock(&tu->filter.rwlock); + + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + + return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + bool done; + + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) { + list_del(&event->hw.tp_list); + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { + tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } + write_unlock(&tu->filter.rwlock); + + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + + return 0; +} + +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) int size, __size, i; int rctx; + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + __size = sizeof(*entry) + tu->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; + return 0; preempt_disable(); @@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) out: preempt_enable(); + return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE); + return probe_event_enable(tu, TP_FLAG_TRACE, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(tu, TP_FLAG_TRACE); @@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE); + return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + #endif default: return 0; @@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { - struct uprobe_trace_consumer *utc; struct trace_uprobe *tu; + int ret = 0; - utc = container_of(con, struct uprobe_trace_consumer, cons); - tu = utc->tu; - if (!tu || tu->consumer != utc) - return 0; + tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; if (tu->flags & TP_FLAG_TRACE) - uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS if (tu->flags & TP_FLAG_PROFILE) - uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs); #endif - return 0; + return ret; } static struct trace_event_functions uprobe_funcs = { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d96ba22dabfa..0c05a4592047 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, static struct tracepoint_entry *get_tracepoint(const char *name) { struct hlist_head *head; - struct hlist_node *node; struct tracepoint_entry *e; u32 hash = jhash(name, strlen(name), 0); head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { + hlist_for_each_entry(e, head, hlist) { if (!strcmp(name, e->name)) return e; } @@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name) static struct tracepoint_entry *add_tracepoint(const char *name) { struct hlist_head *head; - struct hlist_node *node; struct tracepoint_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { + hlist_for_each_entry(e, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE "tracepoint %s busy\n", name); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 625df0b44690..a1dd9a1b1327 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; struct timespec uptime, ts; + cputime_t utime, stime, utimescaled, stimescaled; u64 ac_etime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_ppid = pid_alive(tsk) ? task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); - stats->ac_utime = cputime_to_usecs(tsk->utime); - stats->ac_stime = cputime_to_usecs(tsk->stime); - stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); - stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); + + task_cputime(tsk, &utime, &stime); + stats->ac_utime = cputime_to_usecs(utime); + stats->ac_stime = cputime_to_usecs(stime); + + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + stats->ac_utimescaled = cputime_to_usecs(utimescaled); + stats->ac_stimescaled = cputime_to_usecs(stimescaled); + stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; @@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) #undef KB #undef MB -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) +static void __acct_update_integrals(struct task_struct *tsk, + cputime_t utime, cputime_t stime) { if (likely(tsk->mm)) { cputime_t time, dtime; @@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk) u64 delta; local_irq_save(flags); - time = tsk->stime + tsk->utime; + time = stime + utime; dtime = time - tsk->acct_timexpd; jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; @@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk) } /** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); + __acct_update_integrals(tsk, utime, stime); +} + +/** + * acct_account_cputime - update mm integral after cputime update + * @tsk: task_struct for accounting + */ +void acct_account_cputime(struct task_struct *tsk) +{ + __acct_update_integrals(tsk, tsk->utime, tsk->stime); +} + +/** * acct_clear_integrals - clear the mm integral fields in task_struct * @tsk: task_struct whose accounting fields are cleared */ diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 1744bb80f1fb..394f70b17162 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister); void fire_user_return_notifiers(void) { struct user_return_notifier *urn; - struct hlist_node *tmp1, *tmp2; + struct hlist_node *tmp2; struct hlist_head *head; head = &get_cpu_var(return_notifier_list); - hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) + hlist_for_each_entry_safe(urn, tmp2, head, link) urn->on_user_return(urn); put_cpu_var(return_notifier_list); } diff --git a/kernel/user.c b/kernel/user.c index 750acffbe9ec..e81978e8c03b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/proc_fs.h> /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -46,11 +47,10 @@ struct user_namespace init_user_ns = { .count = 4294967295U, }, }, - .kref = { - .refcount = ATOMIC_INIT(3), - }, + .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -105,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up) static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; - struct hlist_node *h; - hlist_for_each_entry(user, h, hashent, uidhash_node) { + hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { atomic_inc(&user->__count); return user; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 456a6b9fba34..8b650837083e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/proc_fs.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> @@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) +{ + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + cred->securebits = SECUREBITS_DEFAULT; + cred->cap_inheritable = CAP_EMPTY_SET; + cred->cap_permitted = CAP_FULL_SET; + cred->cap_effective = CAP_FULL_SET; + cred->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(cred->request_key_auth); + cred->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + cred->user_ns = user_ns; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -39,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -52,40 +72,48 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; - kref_init(&ns->kref); + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + + atomic_set(&ns->count, 1); + /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - new->securebits = SECUREBITS_DEFAULT; - new->cap_inheritable = CAP_EMPTY_SET; - new->cap_permitted = CAP_FULL_SET; - new->cap_effective = CAP_FULL_SET; - new->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* Leave the new->user_ns reference with the new user namespace. */ - /* Leave the reference to our user_ns with the new cred. */ - new->user_ns = ns; + set_cred_user_ns(new, ns); return 0; } -void free_user_ns(struct kref *kref) +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { - struct user_namespace *parent, *ns = - container_of(kref, struct user_namespace, kref); + struct cred *cred; - parent = ns->parent; - kmem_cache_free(user_ns_cachep, ns); - put_user_ns(parent); + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + +void free_user_ns(struct user_namespace *ns) +{ + struct user_namespace *parent; + + do { + parent = ns->parent; + proc_free_inum(ns->proc_inum); + kmem_cache_free(user_ns_cachep, ns); + ns = parent; + } while (atomic_dec_and_test(&parent->count)); } EXPORT_SYMBOL(free_user_ns); @@ -372,7 +400,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -393,7 +421,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -492,6 +520,42 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; +static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +{ + u32 upper_first, lower_first, upper_last, lower_last; + unsigned idx; + + upper_first = extent->first; + lower_first = extent->lower_first; + upper_last = upper_first + extent->count - 1; + lower_last = lower_first + extent->count - 1; + + for (idx = 0; idx < new_map->nr_extents; idx++) { + u32 prev_upper_first, prev_lower_first; + u32 prev_upper_last, prev_lower_last; + struct uid_gid_extent *prev; + + prev = &new_map->extent[idx]; + + prev_upper_first = prev->first; + prev_lower_first = prev->lower_first; + prev_upper_last = prev_upper_first + prev->count - 1; + prev_lower_last = prev_lower_first + prev->count - 1; + + /* Does the upper range intersect a previous extent? */ + if ((prev_upper_first <= upper_last) && + (prev_upper_last >= upper_first)) + return true; + + /* Does the lower range intersect a previous extent? */ + if ((prev_lower_first <= lower_last) && + (prev_lower_last >= lower_first)) + return true; + } + return false; +} + + static DEFINE_MUTEX(id_map_mutex); static ssize_t map_write(struct file *file, const char __user *buf, @@ -504,7 +568,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent, *last = NULL; + struct uid_gid_extent *extent = NULL; unsigned long page = 0; char *kbuf, *pos, *next_line; ssize_t ret = -EINVAL; @@ -607,14 +671,11 @@ static ssize_t map_write(struct file *file, const char __user *buf, if ((extent->lower_first + extent->count) <= extent->lower_first) goto out; - /* For now only accept extents that are strictly in order */ - if (last && - (((last->first + last->count) > extent->first) || - ((last->lower_first + last->count) > extent->lower_first))) + /* Do the ranges in extent overlap any previous extents? */ + if (mappings_overlap(&new_map, extent)) goto out; new_map.nr_extents++; - last = extent; /* Fail if the file contains too many extents */ if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && @@ -669,10 +730,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -681,10 +746,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } @@ -709,6 +778,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow mapping to your own filesystem ids */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); + if (uid_eq(uid, current_fsuid())) + return true; + } + else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (gid_eq(gid, current_fsgid())) + return true; + } + } + /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; @@ -722,6 +806,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } +static void *userns_get(struct task_struct *task) +{ + struct user_namespace *user_ns; + + rcu_read_lock(); + user_ns = get_user_ns(__task_cred(task)->user_ns); + rcu_read_unlock(); + + return user_ns; +} + +static void userns_put(void *ns) +{ + put_user_ns(ns); +} + +static int userns_install(struct nsproxy *nsproxy, void *ns) +{ + struct user_namespace *user_ns = ns; + struct cred *cred; + + /* Don't allow gaining capabilities by reentering + * the same user namespace. + */ + if (user_ns == current_user_ns()) + return -EINVAL; + + /* Threaded processes may not enter a different user namespace */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + put_user_ns(cred->user_ns); + set_cred_user_ns(cred, get_user_ns(user_ns)); + + return commit_creds(cred); +} + +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + +const struct proc_ns_operations userns_operations = { + .name = "user", + .type = CLONE_NEWUSER, + .get = userns_get, + .put = userns_put, + .install = userns_install, + .inum = userns_inum, +}; + static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); diff --git a/kernel/utsname.c b/kernel/utsname.c index 679d97a5d3fd..a47fc5de3113 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -30,20 +30,27 @@ static struct uts_namespace *create_uts_ns(void) /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone - * Return NULL on error (failure to kmalloc), new ns otherwise + * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; } @@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, * versa. */ struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(tsk, old_ns); + new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; @@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -102,19 +109,32 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *ns) +static int utsns_install(struct nsproxy *nsproxy, void *new) { + struct uts_namespace *ns = new; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) + return -EPERM; + get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 63da38c2d820..4f69f9a5e221 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -15,6 +15,8 @@ #include <linux/sysctl.h> #include <linux/wait.h> +#ifdef CONFIG_PROC_SYSCTL + static void *get_uts(ctl_table *table, int write) { char *which = table->data; @@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which) up_write(&uts_sem); } -#ifdef CONFIG_PROC_SYSCTL /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c8c21be11ab4..4a944676358e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/sysctl.h> #include <linux/smpboot.h> +#include <linux/sched/rt.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -31,6 +32,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_disabled; +static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -111,12 +113,12 @@ static int get_softlockup_thresh(void) * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. */ -static unsigned long get_timestamp(int this_cpu) +static unsigned long get_timestamp(void) { - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ + return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ } -static u64 get_sample_period(void) +static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -125,15 +127,13 @@ static u64 get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ static void __touch_watchdog(void) { - int this_cpu = smp_processor_id(); - - __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); + __this_cpu_write(watchdog_touch_ts, get_timestamp()); } void touch_softlockup_watchdog(void) @@ -194,7 +194,7 @@ static int is_hardlockup(void) static int is_softlockup(unsigned long touch_ts) { - unsigned long now = get_timestamp(smp_processor_id()); + unsigned long now = get_timestamp(); /* Warn about unreasonable delays: */ if (time_after(now, touch_ts + get_softlockup_thresh())) @@ -275,7 +275,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) wake_up_process(__this_cpu_read(softlockup_watchdog)); /* .. and repeat */ - hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); if (touch_ts == 0) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { @@ -343,6 +343,10 @@ static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + if (!watchdog_enabled) { kthread_park(current); return; @@ -351,12 +355,8 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* kick off the timer for the hardlockup detector */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; - /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); /* initialize timestamp */ @@ -368,9 +368,6 @@ static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - if (!watchdog_enabled) - return; - watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ @@ -386,7 +383,7 @@ static int watchdog_should_run(unsigned int cpu) /* * The watchdog thread function - touches the timestamp. * - * It only runs once every get_sample_period() seconds (4 seconds by + * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). @@ -519,6 +516,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, if (ret || !write) return ret; + set_sample_period(); if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else @@ -540,6 +538,7 @@ static struct smp_hotplug_thread watchdog_threads = { void __init lockup_detector_init(void) { + set_sample_period(); if (smpboot_register_percpu_thread(&watchdog_threads)) { pr_err("Failed to create watchdog threads, disabled\n"); watchdog_disabled = -ENODEV; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fbc6576a83c3..81f2457811eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,32 +41,31 @@ #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> +#include <linux/hashtable.h> -#include "workqueue_sched.h" +#include "workqueue_internal.h" enum { /* - * global_cwq flags + * worker_pool flags * - * A bound gcwq is either associated or disassociated with its CPU. + * A bound pool is either associated or disassociated with its CPU. * While associated (!DISASSOCIATED), all workers are bound to the * CPU and none has %WORKER_UNBOUND set and concurrency management * is in effect. * * While DISASSOCIATED, the cpu may be offline and all workers have * %WORKER_UNBOUND set and concurrency management disabled, and may - * be executing on any CPU. The gcwq behaves as an unbound one. + * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED can be flipped only while holding - * assoc_mutex of all pools on the gcwq to avoid changing binding - * state while create_worker() is in progress. + * assoc_mutex to avoid changing binding state while + * create_worker() is in progress. */ - GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 1, /* freeze in progress */ - - /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ + POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -79,11 +78,9 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, - NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ + NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ - BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, - BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ @@ -111,48 +108,24 @@ enum { * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. * - * L: gcwq->lock protected. Access with gcwq->lock held. + * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires gcwq->lock and - * should be done only from local cpu. Either disabling preemption - * on local cpu or grabbing gcwq->lock is enough for read access. - * If GCWQ_DISASSOCIATED is set, it's identical to L. + * X: During normal operation, modification requires pool->lock and should + * be done only from local cpu. Either disabling preemption on local + * cpu or grabbing pool->lock is enough for read access. If + * POOL_DISASSOCIATED is set, it's identical to L. * * F: wq->flush_mutex protected. * * W: workqueue_lock protected. */ -struct global_cwq; -struct worker_pool; - -/* - * The poor guys doing the actual heavy lifting. All on-duty workers - * are either serving the manager role, on idle list or on busy hash. - */ -struct worker { - /* on idle list while idle, on busy hash table while busy */ - union { - struct list_head entry; /* L: while idle */ - struct hlist_node hentry; /* L: while busy */ - }; - - struct work_struct *current_work; /* L: work being processed */ - struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ - struct list_head scheduled; /* L: scheduled works */ - struct task_struct *task; /* I: worker task */ - struct worker_pool *pool; /* I: the associated pool */ - /* 64 bytes boundary on 64bit, 32 on 32bit */ - unsigned long last_active; /* L: last active timestamp */ - unsigned int flags; /* X: flags */ - int id; /* I: worker id */ - - /* for rebinding worker to CPU */ - struct work_struct rebind_work; /* L: for busy worker */ -}; +/* struct worker is defined in workqueue_internal.h */ struct worker_pool { - struct global_cwq *gcwq; /* I: the owning gcwq */ + spinlock_t lock; /* the pool lock */ + unsigned int cpu; /* I: the associated cpu */ + int id; /* I: pool ID */ unsigned int flags; /* X: flags */ struct list_head worklist; /* L: list of pending works */ @@ -165,34 +138,28 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ - struct ida worker_ida; /* L: for worker IDs */ -}; - -/* - * Global per-cpu workqueue. There's one and only one for each cpu - * and all works are queued and processed here regardless of their - * target workqueues. - */ -struct global_cwq { - spinlock_t lock; /* the gcwq lock */ - unsigned int cpu; /* I: the associated cpu */ - unsigned int flags; /* L: GCWQ_* flags */ - - /* workers are chained either in busy_hash or pool idle_list */ - struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* workers are chained either in busy_hash or idle_list */ + DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - struct worker_pool pools[NR_WORKER_POOLS]; - /* normal and highpri pools */ + struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ + struct ida worker_ida; /* L: for worker IDs */ + + /* + * The current concurrency level. As it's likely to be accessed + * from other CPUs during try_to_wake_up(), put it in a separate + * cacheline. + */ + atomic_t nr_running ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp; /* - * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of - * work_struct->data are used for flags and thus cwqs need to be - * aligned at two's power of the number of flag bits. + * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS + * of work_struct->data are used for flags and the remaining high bits + * point to the pwq; thus, pwqs need to be aligned at two's power of the + * number of flag bits. */ -struct cpu_workqueue_struct { +struct pool_workqueue { struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ @@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t; struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ union { - struct cpu_workqueue_struct __percpu *pcpu; - struct cpu_workqueue_struct *single; + struct pool_workqueue __percpu *pcpu; + struct pool_workqueue *single; unsigned long v; - } cpu_wq; /* I: cwq's */ + } pool_wq; /* I: pwq's */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ int work_color; /* F: current work color */ int flush_color; /* F: current flush color */ - atomic_t nr_cwqs_to_flush; /* flush in progress */ + atomic_t nr_pwqs_to_flush; /* flush in progress */ struct wq_flusher *first_flusher; /* F: first flusher */ struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ @@ -259,7 +226,7 @@ struct workqueue_struct { struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* W: drain in progress */ - int saved_max_active; /* W: saved cwq max_active */ + int saved_max_active; /* W: saved pwq max_active */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> -#define for_each_worker_pool(pool, gcwq) \ - for ((pool) = &(gcwq)->pools[0]; \ - (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) +#define for_each_std_worker_pool(pool, cpu) \ + for ((pool) = &std_worker_pools(cpu)[0]; \ + (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) -#define for_each_busy_worker(worker, i, pos, gcwq) \ - for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ - hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) +#define for_each_busy_worker(worker, i, pool) \ + hash_for_each(pool->busy_hash, i, worker, hentry) -static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) { if (cpu < nr_cpu_ids) { if (sw & 1) { @@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, if (sw & 2) return WORK_CPU_UNBOUND; } - return WORK_CPU_NONE; + return WORK_CPU_END; } -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) +static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) { - return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); + return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); } /* * CPU iterators * - * An extra gcwq is defined for an invalid cpu number + * An extra cpu number is defined using an invalid cpu number * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to - * for_each_*_cpu() iterators but also considers the unbound gcwq. + * specific CPU. The following iterators are similar to for_each_*_cpu() + * iterators but also considers the unbound CPU. * - * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_pwq_cpu() : possible CPUs for bound workqueues, * WORK_CPU_UNBOUND for unbound workqueues */ -#define for_each_gcwq_cpu(cpu) \ - for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) +#define for_each_wq_cpu(cpu) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_END; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) -#define for_each_online_gcwq_cpu(cpu) \ - for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) +#define for_each_online_wq_cpu(cpu) \ + for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_END; \ + (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) -#define for_each_cwq_cpu(cpu, wq) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_NONE; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) +#define for_each_pwq_cpu(cpu, wq) \ + for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_END; \ + (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -459,57 +425,69 @@ static LIST_HEAD(workqueues); static bool workqueue_freezing; /* W: have wqs started freezing? */ /* - * The almighty global cpu workqueues. nr_running is the only field - * which is expected to be used frequently by other cpus via - * try_to_wake_up(). Put it in a separate cacheline. + * The CPU and unbound standard worker pools. The unbound ones have + * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. */ -static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_std_worker_pools); +static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; -/* - * Global cpu workqueue and nr_running counter for unbound gcwq. The - * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its - * workers have WORKER_UNBOUND set. - */ -static struct global_cwq unbound_global_cwq; -static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { - [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ -}; +/* idr of all pools */ +static DEFINE_MUTEX(worker_pool_idr_mutex); +static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); -static int worker_pool_pri(struct worker_pool *pool) +static struct worker_pool *std_worker_pools(int cpu) { - return pool - pool->gcwq->pools; + if (cpu != WORK_CPU_UNBOUND) + return per_cpu(cpu_std_worker_pools, cpu); + else + return unbound_std_worker_pools; } -static struct global_cwq *get_gcwq(unsigned int cpu) +static int std_worker_pool_pri(struct worker_pool *pool) { - if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(global_cwq, cpu); - else - return &unbound_global_cwq; + return pool - std_worker_pools(pool->cpu); } -static atomic_t *get_pool_nr_running(struct worker_pool *pool) +/* allocate ID and assign it to @pool */ +static int worker_pool_assign_id(struct worker_pool *pool) { - int cpu = pool->gcwq->cpu; - int idx = worker_pool_pri(pool); + int ret; - if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(pool_nr_running, cpu)[idx]; - else - return &unbound_pool_nr_running[idx]; + mutex_lock(&worker_pool_idr_mutex); + idr_pre_get(&worker_pool_idr, GFP_KERNEL); + ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + mutex_unlock(&worker_pool_idr_mutex); + + return ret; } -static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, - struct workqueue_struct *wq) +/* + * Lookup worker_pool by id. The idr currently is built during boot and + * never modified. Don't worry about locking for now. + */ +static struct worker_pool *worker_pool_by_id(int pool_id) +{ + return idr_find(&worker_pool_idr, pool_id); +} + +static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) +{ + struct worker_pool *pools = std_worker_pools(cpu); + + return &pools[highpri]; +} + +static struct pool_workqueue *get_pwq(unsigned int cpu, + struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); + return per_cpu_ptr(wq->pool_wq.pcpu, cpu); } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->cpu_wq.single; + return wq->pool_wq.single; return NULL; } @@ -530,19 +508,19 @@ static int work_next_color(int color) } /* - * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data - * contain the pointer to the queued cwq. Once execution starts, the flag - * is cleared and the high bits contain OFFQ flags and CPU number. + * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data + * contain the pointer to the queued pwq. Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and pool ID. * - * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() - * and clear_work_data() can be used to set the cwq, cpu or clear + * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the pwq, pool or clear * work->data. These functions should only be called while the work is * owned - ie. while the PENDING bit is set. * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to - * a work. gcwq is available once the work has been queued anywhere after - * initialization until it is sync canceled. cwq is available only while - * the work item is queued. + * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq + * corresponding to a work. Pool is available once the work has been + * queued anywhere after initialization until it is sync canceled. pwq is + * available only while the work item is queued. * * %WORK_OFFQ_CANCELING is used to mark a work item which is being * canceled. While being canceled, a work item may have its PENDING set @@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data, atomic_long_set(&work->data, data | flags | work_static(work)); } -static void set_work_cwq(struct work_struct *work, - struct cpu_workqueue_struct *cwq, +static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long extra_flags) { - set_work_data(work, (unsigned long)cwq, - WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); + set_work_data(work, (unsigned long)pwq, + WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); } -static void set_work_cpu_and_clear_pending(struct work_struct *work, - unsigned int cpu) +static void set_work_pool_and_keep_pending(struct work_struct *work, + int pool_id) +{ + set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, + WORK_STRUCT_PENDING); +} + +static void set_work_pool_and_clear_pending(struct work_struct *work, + int pool_id) { /* * The following wmb is paired with the implied mb in @@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work, * owner. */ smp_wmb(); - set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); + set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); } static void clear_work_data(struct work_struct *work) { - smp_wmb(); /* see set_work_cpu_and_clear_pending() */ - set_work_data(work, WORK_STRUCT_NO_CPU, 0); + smp_wmb(); /* see set_work_pool_and_clear_pending() */ + set_work_data(work, WORK_STRUCT_NO_POOL, 0); } -static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - if (data & WORK_STRUCT_CWQ) + if (data & WORK_STRUCT_PWQ) return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); else return NULL; } -static struct global_cwq *get_work_gcwq(struct work_struct *work) +/** + * get_work_pool - return the worker_pool a given work was associated with + * @work: the work item of interest + * + * Return the worker_pool @work was last associated with. %NULL if none. + */ +static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - unsigned int cpu; + struct worker_pool *pool; + int pool_id; - if (data & WORK_STRUCT_CWQ) - return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; + if (data & WORK_STRUCT_PWQ) + return ((struct pool_workqueue *) + (data & WORK_STRUCT_WQ_DATA_MASK))->pool; - cpu = data >> WORK_OFFQ_CPU_SHIFT; - if (cpu == WORK_CPU_NONE) + pool_id = data >> WORK_OFFQ_POOL_SHIFT; + if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; - BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); - return get_gcwq(cpu); + pool = worker_pool_by_id(pool_id); + WARN_ON_ONCE(!pool); + return pool; +} + +/** + * get_work_pool_id - return the worker pool ID a given work is associated with + * @work: the work item of interest + * + * Return the worker_pool ID @work was last associated with. + * %WORK_OFFQ_POOL_NONE if none. + */ +static int get_work_pool_id(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_PWQ) + return ((struct pool_workqueue *) + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; + + return data >> WORK_OFFQ_POOL_SHIFT; } static void mark_work_canceling(struct work_struct *work) { - struct global_cwq *gcwq = get_work_gcwq(work); - unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + unsigned long pool_id = get_work_pool_id(work); - set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, - WORK_STRUCT_PENDING); + pool_id <<= WORK_OFFQ_POOL_SHIFT; + set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); } static bool work_is_canceling(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); + return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); } /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that - * they're being called with gcwq->lock held. + * they're being called with pool->lock held. */ static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_pool_nr_running(pool)); + return !atomic_read(&pool->nr_running); } /* @@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool) * running workers. * * Note that, because unbound workers never contribute to nr_running, this - * function will always return %true for unbound gcwq as long as the + * function will always return %true for unbound pools as long as the * worklist isn't empty. */ static bool need_more_worker(struct worker_pool *pool) @@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool) /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_pool_nr_running(pool); - - return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; + return !list_empty(&pool->worklist) && + atomic_read(&pool->nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ @@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool) * Wake up the first idle worker of @pool. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { @@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); - atomic_inc(get_pool_nr_running(worker->pool)); + WARN_ON_ONCE(worker->pool->cpu != cpu); + atomic_inc(&worker->pool->nr_running); } } @@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct worker_pool *pool = worker->pool; - atomic_t *nr_running = get_pool_nr_running(pool); + struct worker_pool *pool; + /* + * Rescuers, which may not have all the fields set up like normal + * workers, also reach here, let's not access anything before + * checking NOT_RUNNING. + */ if (worker->flags & WORKER_NOT_RUNNING) return NULL; + pool = worker->pool; + /* this can only happen on the local cpu */ BUG_ON(cpu != raw_smp_processor_id()); @@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * NOT_RUNNING is clear. This means that we're bound to and * running on the local cpu w/ rq lock held and preemption * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without gcwq + * manipulating idle_list, so dereferencing idle_list without pool * lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) + if (atomic_dec_and_test(&pool->nr_running) && + !list_empty(&pool->worklist)) to_wakeup = first_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * woken up. * * CONTEXT: - * spin_lock_irq(gcwq->lock) + * spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) @@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_pool_nr_running(pool); - if (wakeup) { - if (atomic_dec_and_test(nr_running) && + if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) wake_up_worker(pool); } else - atomic_dec(nr_running); + atomic_dec(&pool->nr_running); } worker->flags |= flags; @@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: - * spin_lock_irq(gcwq->lock) + * spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { @@ -855,87 +868,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_pool_nr_running(pool)); + atomic_inc(&pool->nr_running); } /** - * busy_worker_head - return the busy hash head for a work - * @gcwq: gcwq of interest - * @work: work to be hashed - * - * Return hash head of @gcwq for @work. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to the hash head. - */ -static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, - struct work_struct *work) -{ - const int base_shift = ilog2(sizeof(struct work_struct)); - unsigned long v = (unsigned long)work; - - /* simple shift and fold hash, do we need something better? */ - v >>= base_shift; - v += v >> BUSY_WORKER_HASH_ORDER; - v &= BUSY_WORKER_HASH_MASK; - - return &gcwq->busy_hash[v]; -} - -/** - * __find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @bwh: hash head as returned by busy_worker_head() + * find_worker_executing_work - find worker which is executing a work + * @pool: pool of interest * @work: work to find worker for * - * Find a worker which is executing @work on @gcwq. @bwh should be - * the hash head obtained by calling busy_worker_head() with the same - * work. + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work. For a worker + * to match, its current execution should match the address of @work and + * its work function. This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed. + * + * This is a bit tricky. A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item. If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency. + * + * This function checks the work item address, work function and workqueue + * to avoid false positives. Note that this isn't complete as one may + * construct a work function which can introduce dependency onto itself + * through a recycled work item. Well, if somebody wants to shoot oneself + * in the foot that badly, there's only so much we can do, and if such + * deadlock actually occurs, it should be easy to locate the culprit work + * function. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). * * RETURNS: * Pointer to worker which is executing @work if found, NULL * otherwise. */ -static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, - struct hlist_head *bwh, - struct work_struct *work) +static struct worker *find_worker_executing_work(struct worker_pool *pool, + struct work_struct *work) { struct worker *worker; - struct hlist_node *tmp; - hlist_for_each_entry(worker, tmp, bwh, hentry) - if (worker->current_work == work) + hash_for_each_possible(pool->busy_hash, worker, hentry, + (unsigned long)work) + if (worker->current_work == work && + worker->current_func == work->func) return worker; - return NULL; -} -/** - * find_worker_executing_work - find worker which is executing a work - * @gcwq: gcwq of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @gcwq. This function is - * identical to __find_worker_executing_work() except that this - * function calculates @bwh itself. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct global_cwq *gcwq, - struct work_struct *work) -{ - return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), - work); + return NULL; } /** @@ -953,7 +934,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, * nested inside outer list_for_each_entry_safe(). * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) @@ -979,67 +960,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } -static void cwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_delayed_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); - move_linked_works(work, &cwq->pool->worklist, NULL); + move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; + pwq->nr_active++; } -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +static void pwq_activate_first_delayed(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->delayed_works, struct work_struct, entry); - cwq_activate_delayed_work(work); + pwq_activate_delayed_work(work); } /** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest + * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight + * @pwq: pwq of interest * @color: color of work which left the queue * * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. + * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { /* ignore uncolored works */ if (color == WORK_NO_COLOR) return; - cwq->nr_in_flight[color]--; + pwq->nr_in_flight[color]--; - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { + pwq->nr_active--; + if (!list_empty(&pwq->delayed_works)) { /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); } /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) + if (likely(pwq->flush_color != color)) return; /* are there still in-flight works? */ - if (cwq->nr_in_flight[color]) + if (pwq->nr_in_flight[color]) return; - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; + /* this pwq is done, clear flush_color */ + pwq->flush_color = -1; /* - * If this was the last cwq, wake up the first flusher. It + * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); + if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) + complete(&pwq->wq->first_flusher->done); } /** @@ -1070,7 +1051,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) { - struct global_cwq *gcwq; + struct worker_pool *pool; + struct pool_workqueue *pwq; local_irq_save(*flags); @@ -1095,41 +1077,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - gcwq = get_work_gcwq(work); - if (!gcwq) + pool = get_work_pool(work); + if (!pool) goto fail; - spin_lock(&gcwq->lock); - if (!list_empty(&work->entry)) { + spin_lock(&pool->lock); + /* + * work->data is guaranteed to point to pwq only while the work + * item is queued on pwq->wq, and both updating work->data to point + * to pwq on queueing and to pool on dequeueing are done under + * pwq->pool->lock. This in turn guarantees that, if work->data + * points to pwq which is associated with a locked pool, the work + * item is currently queued on that pool. + */ + pwq = get_work_pwq(work); + if (pwq && pwq->pool == pool) { + debug_work_deactivate(work); + /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). + * A delayed work item cannot be grabbed directly because + * it might have linked NO_COLOR work items which, if left + * on the delayed_list, will confuse pwq->nr_active + * management later on and cause stall. Make sure the work + * item is activated before grabbing. */ - smp_rmb(); - if (gcwq == get_work_gcwq(work)) { - debug_work_deactivate(work); + if (*work_data_bits(work) & WORK_STRUCT_DELAYED) + pwq_activate_delayed_work(work); - /* - * A delayed work item cannot be grabbed directly - * because it might have linked NO_COLOR work items - * which, if left on the delayed_list, will confuse - * cwq->nr_active management later on and cause - * stall. Make sure the work item is activated - * before grabbing. - */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - cwq_activate_delayed_work(work); + list_del_init(&work->entry); + pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work)); + /* work->data points to pwq iff queued, point to pool */ + set_work_pool_and_keep_pending(work, pool->id); - spin_unlock(&gcwq->lock); - return 1; - } + spin_unlock(&pool->lock); + return 1; } - spin_unlock(&gcwq->lock); + spin_unlock(&pool->lock); fail: local_irq_restore(*flags); if (work_is_canceling(work)) @@ -1139,33 +1123,25 @@ fail: } /** - * insert_work - insert a work into gcwq - * @cwq: cwq @work belongs to + * insert_work - insert a work into a pool + * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * - * Insert @work which belongs to @cwq into @gcwq after @head. - * @extra_flags is or'd to work_struct flags. + * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to + * work_struct flags. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ -static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head, - unsigned int extra_flags) +static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, + struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = cwq->pool; + struct worker_pool *pool = pwq->pool; /* we own @work, set data and link */ - set_work_cwq(work, cwq, extra_flags); - - /* - * Ensure that we get the right work->data if we see the - * result of list_add() below, see try_to_grab_pending(). - */ - smp_wmb(); - + set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); /* @@ -1181,41 +1157,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq, /* * Test whether @work is being queued from another work executing on the - * same workqueue. This is rather expensive and should only be used from - * cold paths. + * same workqueue. */ static bool is_chained_work(struct workqueue_struct *wq) { - unsigned long flags; - unsigned int cpu; - - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; - struct hlist_node *pos; - int i; + struct worker *worker; - spin_lock_irqsave(&gcwq->lock, flags); - for_each_busy_worker(worker, i, pos, gcwq) { - if (worker->task != current) - continue; - spin_unlock_irqrestore(&gcwq->lock, flags); - /* - * I'm @worker, no locking necessary. See if @work - * is headed to the same workqueue. - */ - return worker->current_cwq->wq == wq; - } - spin_unlock_irqrestore(&gcwq->lock, flags); - } - return false; + worker = current_wq_worker(); + /* + * Return %true iff I'm a worker execuing a work item on @wq. If + * I'm @worker, it's safe to dereference it without locking. + */ + return worker && worker->current_pwq->wq == wq; } static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { - struct global_cwq *gcwq; - struct cpu_workqueue_struct *cwq; + struct pool_workqueue *pwq; struct list_head *worklist; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1235,9 +1194,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; - /* determine gcwq to use */ + /* determine the pwq to use */ if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *last_gcwq; + struct worker_pool *last_pool; if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); @@ -1248,55 +1207,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, * work needs to be queued on that cpu to guarantee * non-reentrancy. */ - gcwq = get_gcwq(cpu); - last_gcwq = get_work_gcwq(work); + pwq = get_pwq(cpu, wq); + last_pool = get_work_pool(work); - if (last_gcwq && last_gcwq != gcwq) { + if (last_pool && last_pool != pwq->pool) { struct worker *worker; - spin_lock(&last_gcwq->lock); + spin_lock(&last_pool->lock); - worker = find_worker_executing_work(last_gcwq, work); + worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_cwq->wq == wq) - gcwq = last_gcwq; - else { + if (worker && worker->current_pwq->wq == wq) { + pwq = get_pwq(last_pool->cpu, wq); + } else { /* meh... not running there, queue here */ - spin_unlock(&last_gcwq->lock); - spin_lock(&gcwq->lock); + spin_unlock(&last_pool->lock); + spin_lock(&pwq->pool->lock); } } else { - spin_lock(&gcwq->lock); + spin_lock(&pwq->pool->lock); } } else { - gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock(&gcwq->lock); + pwq = get_pwq(WORK_CPU_UNBOUND, wq); + spin_lock(&pwq->pool->lock); } - /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(req_cpu, cwq, work); + /* pwq determined, queue */ + trace_workqueue_queue_work(req_cpu, pwq, work); if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&gcwq->lock); + spin_unlock(&pwq->pool->lock); return; } - cwq->nr_in_flight[cwq->work_color]++; - work_flags = work_color_to_flags(cwq->work_color); + pwq->nr_in_flight[pwq->work_color]++; + work_flags = work_color_to_flags(pwq->work_color); - if (likely(cwq->nr_active < cwq->max_active)) { + if (likely(pwq->nr_active < pwq->max_active)) { trace_workqueue_activate_work(work); - cwq->nr_active++; - worklist = &cwq->pool->worklist; + pwq->nr_active++; + worklist = &pwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; - worklist = &cwq->delayed_works; + worklist = &pwq->delayed_works; } - insert_work(cwq, work, worklist, work_flags); + insert_work(pwq, work, worklist, work_flags); - spin_unlock(&gcwq->lock); + spin_unlock(&pwq->pool->lock); } /** @@ -1347,19 +1305,17 @@ EXPORT_SYMBOL_GPL(queue_work); void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); /* should have been called from irqsafe timer with irq already off */ - __queue_work(dwork->cpu, cwq->wq, &dwork->work); + __queue_work(dwork->cpu, dwork->wq, &dwork->work); } -EXPORT_SYMBOL_GPL(delayed_work_timer_fn); +EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - unsigned int lcpu; WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); @@ -1379,30 +1335,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); - /* - * This stores cwq for the moment, for the timer_fn. Note that the - * work's gcwq is preserved to allow reentrance detection for - * delayed works. - */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); - - /* - * If we cannot get the last gcwq from @work directly, - * select the last CPU such that it avoids unnecessarily - * triggering non-reentrancy check in __queue_work(). - */ - lcpu = cpu; - if (gcwq) - lcpu = gcwq->cpu; - if (lcpu == WORK_CPU_UNBOUND) - lcpu = raw_smp_processor_id(); - } else { - lcpu = WORK_CPU_UNBOUND; - } - - set_work_cwq(work, get_cwq(lcpu, wq), 0); - + dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; @@ -1519,12 +1452,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work); * necessary. * * LOCKING: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1542,14 +1474,14 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because gcwq_unbind_fn() releases - * gcwq->lock between setting %WORKER_UNBOUND and zapping + * Sanity check nr_running. Because wq_unbind_fn() releases + * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. */ - WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && + WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && - atomic_read(get_pool_nr_running(pool))); + atomic_read(&pool->nr_running)); } /** @@ -1559,7 +1491,7 @@ static void worker_enter_idle(struct worker *worker) * @worker is leaving idle state. Update stats. * * LOCKING: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { @@ -1572,7 +1504,7 @@ static void worker_leave_idle(struct worker *worker) } /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool * @worker: self * * Works which are scheduled while the cpu is online must at least be @@ -1584,27 +1516,27 @@ static void worker_leave_idle(struct worker *worker) * themselves to the target cpu and may race with cpu going down or * coming online. kthread_bind() can't be used because it may put the * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and gcwq may be + * verbatim as it's best effort and blocking and pool may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies the - * binding against %GCWQ_DISASSOCIATED which is set during + * This function tries set_cpus_allowed() and locks pool and verifies the + * binding against %POOL_DISASSOCIATED which is set during * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker * enters idle state or fetches works without dropping lock, it can * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: - * Might sleep. Called without any lock but returns with gcwq->lock + * Might sleep. Called without any lock but returns with pool->lock * held. * * RETURNS: - * %true if the associated gcwq is online (@worker is successfully + * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ static bool worker_maybe_bind_and_lock(struct worker *worker) -__acquires(&gcwq->lock) +__acquires(&pool->lock) { - struct global_cwq *gcwq = worker->pool->gcwq; + struct worker_pool *pool = worker->pool; struct task_struct *task = worker->task; while (true) { @@ -1612,19 +1544,19 @@ __acquires(&gcwq->lock) * The following call may fail, succeed or succeed * without actually migrating the task to the cpu if * it races with cpu hotunplug operation. Verify - * against GCWQ_DISASSOCIATED. + * against POOL_DISASSOCIATED. */ - if (!(gcwq->flags & GCWQ_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + if (!(pool->flags & POOL_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); - spin_lock_irq(&gcwq->lock); - if (gcwq->flags & GCWQ_DISASSOCIATED) + spin_lock_irq(&pool->lock); + if (pool->flags & POOL_DISASSOCIATED) return false; - if (task_cpu(task) == gcwq->cpu && + if (task_cpu(task) == pool->cpu && cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(gcwq->cpu))) + get_cpu_mask(pool->cpu))) return true; - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* * We've raced with CPU hot[un]plug. Give it a breather @@ -1643,15 +1575,13 @@ __acquires(&gcwq->lock) */ static void idle_worker_rebind(struct worker *worker) { - struct global_cwq *gcwq = worker->pool->gcwq; - /* CPU may go down again inbetween, clear UNBOUND only on success */ if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_UNBOUND); /* rebind complete, become available again */ list_add(&worker->entry, &worker->pool->idle_list); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&worker->pool->lock); } /* @@ -1663,19 +1593,18 @@ static void idle_worker_rebind(struct worker *worker) static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_UNBOUND); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&worker->pool->lock); } /** - * rebind_workers - rebind all workers of a gcwq to the associated CPU - * @gcwq: gcwq of interest + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest * - * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding * is different for idle and busy ones. * * Idle ones will be removed from the idle_list and woken up. They will @@ -1693,38 +1622,31 @@ static void busy_worker_rebind_fn(struct work_struct *work) * including the manager will not appear on @idle_list until rebind is * complete, making local wake-ups safe. */ -static void rebind_workers(struct global_cwq *gcwq) +static void rebind_workers(struct worker_pool *pool) { - struct worker_pool *pool; struct worker *worker, *n; - struct hlist_node *pos; int i; - lockdep_assert_held(&gcwq->lock); - - for_each_worker_pool(pool, gcwq) - lockdep_assert_held(&pool->assoc_mutex); + lockdep_assert_held(&pool->assoc_mutex); + lockdep_assert_held(&pool->lock); /* dequeue and kick idle ones */ - for_each_worker_pool(pool, gcwq) { - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list - * until rebind is complete to avoid receiving - * premature local wake-ups. - */ - list_del_init(&worker->entry); + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list until rebind + * is complete to avoid receiving premature local wake-ups. + */ + list_del_init(&worker->entry); - /* - * worker_thread() will see the above dequeuing - * and call idle_worker_rebind(). - */ - wake_up_process(worker->task); - } + /* + * worker_thread() will see the above dequeuing and call + * idle_worker_rebind(). + */ + wake_up_process(worker->task); } /* rebind busy workers */ - for_each_busy_worker(worker, i, pos, gcwq) { + for_each_busy_worker(worker, i, pool) { struct work_struct *rebind_work = &worker->rebind_work; struct workqueue_struct *wq; @@ -1736,16 +1658,16 @@ static void rebind_workers(struct global_cwq *gcwq) /* * wq doesn't really matter but let's keep @worker->pool - * and @cwq->pool consistent for sanity. + * and @pwq->pool consistent for sanity. */ - if (worker_pool_pri(worker->pool)) + if (std_worker_pool_pri(worker->pool)) wq = system_highpri_wq; else wq = system_wq; - insert_work(get_cwq(gcwq->cpu, wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); + insert_work(get_pwq(pool->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } } @@ -1780,19 +1702,18 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - struct global_cwq *gcwq = pool->gcwq; - const char *pri = worker_pool_pri(pool) ? "H" : ""; + const char *pri = std_worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); while (ida_get_new(&pool->worker_ida, &id)) { - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); worker = alloc_worker(); if (!worker) @@ -1801,30 +1722,30 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool = pool; worker->id = id; - if (gcwq->cpu != WORK_CPU_UNBOUND) + if (pool->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, - worker, cpu_to_node(gcwq->cpu), - "kworker/%u:%d%s", gcwq->cpu, id, pri); + worker, cpu_to_node(pool->cpu), + "kworker/%u:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d%s", id, pri); if (IS_ERR(worker->task)) goto fail; - if (worker_pool_pri(pool)) + if (std_worker_pool_pri(pool)) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* * Determine CPU binding of the new worker depending on - * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * %POOL_DISASSOCIATED. The caller is responsible for ensuring the * flag remains stable across this function. See the comments * above the flag definition for details. * * As an unbound worker may later become a regular one if CPU comes * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { - kthread_bind(worker->task, gcwq->cpu); + if (!(pool->flags & POOL_DISASSOCIATED)) { + kthread_bind(worker->task, pool->cpu); } else { worker->task->flags |= PF_THREAD_BOUND; worker->flags |= WORKER_UNBOUND; @@ -1833,9 +1754,9 @@ static struct worker *create_worker(struct worker_pool *pool) return worker; fail: if (id >= 0) { - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); ida_remove(&pool->worker_ida, id); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } kfree(worker); return NULL; @@ -1845,10 +1766,10 @@ fail: * start_worker - start a newly created worker * @worker: worker to start * - * Make the gcwq aware of @worker and start it. + * Make the pool aware of @worker and start it. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ static void start_worker(struct worker *worker) { @@ -1862,15 +1783,14 @@ static void start_worker(struct worker *worker) * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * - * Destroy @worker and adjust @gcwq stats accordingly. + * Destroy @worker and adjust @pool stats accordingly. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock) which is released and regrabbed. */ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1885,21 +1805,20 @@ static void destroy_worker(struct worker *worker) list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); kthread_stop(worker->task); kfree(worker); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; - struct global_cwq *gcwq = pool->gcwq; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (too_many_workers(pool)) { struct worker *worker; @@ -1918,20 +1837,20 @@ static void idle_worker_timeout(unsigned long __pool) } } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } static bool send_mayday(struct work_struct *work) { - struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct workqueue_struct *wq = cwq->wq; + struct pool_workqueue *pwq = get_work_pwq(work); + struct workqueue_struct *wq = pwq->wq; unsigned int cpu; if (!(wq->flags & WQ_RESCUER)) return false; /* mayday mayday mayday */ - cpu = cwq->pool->gcwq->cpu; + cpu = pwq->pool->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -1940,13 +1859,12 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; - struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (need_to_create_worker(pool)) { /* @@ -1959,7 +1877,7 @@ static void gcwq_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -1978,24 +1896,22 @@ static void gcwq_mayday_timeout(unsigned long __pool) * may_start_working() true. * * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. * * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true + * false if no action was taken and pool->lock stayed locked, true * otherwise. */ static bool maybe_create_worker(struct worker_pool *pool) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) +__releases(&pool->lock) +__acquires(&pool->lock) { - struct global_cwq *gcwq = pool->gcwq; - if (!need_to_create_worker(pool)) return false; restart: - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); @@ -2006,7 +1922,7 @@ restart: worker = create_worker(pool); if (worker) { del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); start_worker(worker); BUG_ON(need_to_create_worker(pool)); return true; @@ -2023,7 +1939,7 @@ restart: } del_timer_sync(&pool->mayday_timer); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (need_to_create_worker(pool)) goto restart; return true; @@ -2037,11 +1953,11 @@ restart: * IDLE_WORKER_TIMEOUT. * * LOCKING: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Called only from manager. * * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true + * false if no action was taken and pool->lock stayed locked, true * otherwise. */ static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2071,21 +1987,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool) * manage_workers - manage worker pool * @worker: self * - * Assume the manager role and manage gcwq worker pool @worker belongs + * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per - * gcwq. The exclusion is handled automatically by this function. + * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. On * true return, it's guaranteed that need_to_create_worker() is false * and may_start_working() is true. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * RETURNS: - * false if no action was taken and gcwq->lock stayed locked, true if - * some action was taken. + * spin_lock_irq(pool->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. */ static bool manage_workers(struct worker *worker) { @@ -2107,20 +2023,20 @@ static bool manage_workers(struct worker *worker) * manager against CPU hotplug. * * assoc_mutex would always be free unless CPU hotplug is in - * progress. trylock first without dropping @gcwq->lock. + * progress. trylock first without dropping @pool->lock. */ if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { - spin_unlock_irq(&pool->gcwq->lock); + spin_unlock_irq(&pool->lock); mutex_lock(&pool->assoc_mutex); /* * CPU hotplug could have happened while we were waiting * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and - * @gcwq's state and ours could have deviated. + * @pool's state and ours could have deviated. * * As hotplug is now excluded via assoc_mutex, we can * simply try to bind. It will succeed or fail depending - * on @gcwq's current state. Try it and adjust + * on @pool's current state. Try it and adjust * %WORKER_UNBOUND accordingly. */ if (worker_maybe_bind_and_lock(worker)) @@ -2157,18 +2073,15 @@ static bool manage_workers(struct worker *worker) * call this function to process a work. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) +__releases(&pool->lock) +__acquires(&pool->lock) { - struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; - struct hlist_head *bwh = busy_worker_head(gcwq, work); - bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; - work_func_t f = work->func; + bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; int work_color; struct worker *collision; #ifdef CONFIG_LOCKDEP @@ -2186,11 +2099,11 @@ __acquires(&gcwq->lock) /* * Ensure we're on the correct CPU. DISASSOCIATED test is * necessary to avoid spurious warnings from rescuers servicing the - * unbound or a disassociated gcwq. + * unbound or a disassociated pool. */ WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && - !(gcwq->flags & GCWQ_DISASSOCIATED) && - raw_smp_processor_id() != gcwq->cpu); + !(pool->flags & POOL_DISASSOCIATED) && + raw_smp_processor_id() != pool->cpu); /* * A single work shouldn't be executed concurrently by @@ -2198,7 +2111,7 @@ __acquires(&gcwq->lock) * already processing the work. If so, defer the work to the * currently executing one. */ - collision = __find_worker_executing_work(gcwq, bwh, work); + collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, NULL); return; @@ -2206,9 +2119,10 @@ __acquires(&gcwq->lock) /* claim and dequeue */ debug_work_deactivate(work); - hlist_add_head(&worker->hentry, bwh); + hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; - worker->current_cwq = cwq; + worker->current_func = work->func; + worker->current_pwq = pwq; work_color = get_work_color(work); list_del_init(&work->entry); @@ -2221,53 +2135,55 @@ __acquires(&gcwq->lock) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); /* - * Unbound gcwq isn't concurrency managed and work items should be + * Unbound pool isn't concurrency managed and work items should be * executed ASAP. Wake up another worker if necessary. */ if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) wake_up_worker(pool); /* - * Record the last CPU and clear PENDING which should be the last - * update to @work. Also, do this inside @gcwq->lock so that + * Record the last pool and clear PENDING which should be the last + * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. */ - set_work_cpu_and_clear_pending(work, gcwq->cpu); + set_work_pool_and_clear_pending(work, pool->id); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); - lock_map_acquire_read(&cwq->wq->lockdep_map); + lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); - f(work); + worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. */ trace_workqueue_execute_end(work); lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" " last function: %pf\n", - current->comm, preempt_count(), task_pid_nr(current), f); + current->comm, preempt_count(), task_pid_nr(current), + worker->current_func); debug_show_held_locks(current); dump_stack(); } - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* we're done with it, release */ - hlist_del_init(&worker->hentry); + hash_del(&worker->hentry); worker->current_work = NULL; - worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color); + worker->current_func = NULL; + worker->current_pwq = NULL; + pwq_dec_nr_in_flight(pwq, work_color); } /** @@ -2279,7 +2195,7 @@ __acquires(&gcwq->lock) * fetches a work from the top and executes it. * * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. */ static void process_scheduled_works(struct worker *worker) @@ -2295,8 +2211,8 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The gcwq worker thread function. There's a single dynamic pool of - * these per each cpu. These workers process all works regardless of + * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools + * of these per each cpu. These workers process all works regardless of * their specific target workqueue. The only exception is works which * belong to workqueues with a rescuer which will be explained in * rescuer_thread(). @@ -2305,16 +2221,15 @@ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; woke_up: - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); /* we are off idle list if destruction or rebind is requested */ if (unlikely(list_empty(&worker->entry))) { - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); /* if DIE is set, destruction is requested */ if (worker->flags & WORKER_DIE) { @@ -2373,52 +2288,59 @@ sleep: goto recheck; /* - * gcwq->lock is held and there's no work to process and no - * need to manage, sleep. Workers are woken up only while - * holding gcwq->lock or from local cpu, so setting the - * current state before releasing gcwq->lock is enough to - * prevent losing any event. + * pool->lock is held and there's no work to process and no need to + * manage, sleep. Workers are woken up only while holding + * pool->lock or from local cpu, so setting the current state + * before releasing pool->lock is enough to prevent losing any + * event. */ worker_enter_idle(worker); __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } /** * rescuer_thread - the rescuer thread function - * @__wq: the associated workqueue + * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_RESCUER set. * - * Regular work processing on a gcwq may block trying to create a new + * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * - * When such condition is possible, the gcwq summons rescuers of all - * workqueues which have works queued on the gcwq and let them process + * When such condition is possible, the pool summons rescuers of all + * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. */ -static int rescuer_thread(void *__wq) +static int rescuer_thread(void *__rescuer) { - struct workqueue_struct *wq = __wq; - struct worker *rescuer = wq->rescuer; + struct worker *rescuer = __rescuer; + struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; bool is_unbound = wq->flags & WQ_UNBOUND; unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); + + /* + * Mark rescuer as worker too. As WORKER_PREP is never cleared, it + * doesn't participate in concurrency management. + */ + rescuer->task->flags |= PF_WQ_WORKER; repeat: set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); + rescuer->task->flags &= ~PF_WQ_WORKER; return 0; } @@ -2428,9 +2350,8 @@ repeat: */ for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; - struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct worker_pool *pool = cwq->pool; - struct global_cwq *gcwq = pool->gcwq; + struct pool_workqueue *pwq = get_pwq(tcpu, wq); + struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); @@ -2446,22 +2367,24 @@ repeat: */ BUG_ON(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_cwq(work) == cwq) + if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); /* - * Leave this gcwq. If keep_working() is %true, notify a + * Leave this pool. If keep_working() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ if (keep_working(pool)) wake_up_worker(pool); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } + /* rescuers should never participate in concurrency management */ + WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } @@ -2479,7 +2402,7 @@ static void wq_barrier_func(struct work_struct *work) /** * insert_wq_barrier - insert a barrier work - * @cwq: cwq to insert barrier into + * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing @@ -2496,12 +2419,12 @@ static void wq_barrier_func(struct work_struct *work) * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified - * underneath us, so we can't reliably determine cwq from @target. + * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ -static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, +static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { @@ -2509,7 +2432,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, unsigned int linked = 0; /* - * debugobject calls are safe here even with gcwq->lock locked + * debugobject calls are safe here even with pool->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. @@ -2534,23 +2457,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, } debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head, + insert_work(pwq, &barr->work, head, work_color_to_flags(WORK_NO_COLOR) | linked); } /** - * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing * @wq: workqueue being flushed * @flush_color: new flush color, < 0 for no-op * @work_color: new work color, < 0 for no-op * - * Prepare cwqs for workqueue flushing. + * Prepare pwqs for workqueue flushing. * - * If @flush_color is non-negative, flush_color on all cwqs should be - * -1. If no cwq has in-flight commands at the specified color, all - * cwq->flush_color's stay at -1 and %false is returned. If any cwq - * has in flight commands, its cwq->flush_color is set to - * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * If @flush_color is non-negative, flush_color on all pwqs should be + * -1. If no pwq has in-flight commands at the specified color, all + * pwq->flush_color's stay at -1 and %false is returned. If any pwq + * has in flight commands, its pwq->flush_color is set to + * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq * wakeup logic is armed and %true is returned. * * The caller should have initialized @wq->first_flusher prior to @@ -2558,7 +2481,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * @flush_color is negative, no flush color update is done and %false * is returned. * - * If @work_color is non-negative, all cwqs should have the same + * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * @@ -2569,42 +2492,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ -static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, +static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; unsigned int cpu; if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); - atomic_set(&wq->nr_cwqs_to_flush, 1); + BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); + atomic_set(&wq->nr_pwqs_to_flush, 1); } - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->pool->gcwq; + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct worker_pool *pool = pwq->pool; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (flush_color >= 0) { - BUG_ON(cwq->flush_color != -1); + BUG_ON(pwq->flush_color != -1); - if (cwq->nr_in_flight[flush_color]) { - cwq->flush_color = flush_color; - atomic_inc(&wq->nr_cwqs_to_flush); + if (pwq->nr_in_flight[flush_color]) { + pwq->flush_color = flush_color; + atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } if (work_color >= 0) { - BUG_ON(work_color != work_next_color(cwq->work_color)); - cwq->work_color = work_color; + BUG_ON(work_color != work_next_color(pwq->work_color)); + pwq->work_color = work_color; } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } - if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait; @@ -2655,7 +2578,7 @@ void flush_workqueue(struct workqueue_struct *wq) wq->first_flusher = &this_flusher; - if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ wq->flush_color = next_color; @@ -2666,7 +2589,7 @@ void flush_workqueue(struct workqueue_struct *wq) /* wait in queue */ BUG_ON(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); - flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* @@ -2733,7 +2656,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); - flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { @@ -2743,7 +2666,7 @@ void flush_workqueue(struct workqueue_struct *wq) /* * Need to flush more colors. Make the next flusher - * the new first flusher and arm cwqs. + * the new first flusher and arm pwqs. */ BUG_ON(wq->flush_color == wq->work_color); BUG_ON(wq->flush_color != next->flush_color); @@ -2751,7 +2674,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_del_init(&next->list); wq->first_flusher = next; - if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* @@ -2794,13 +2717,13 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->pool->gcwq->lock); - drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->pool->gcwq->lock); + spin_lock_irq(&pwq->pool->lock); + drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2822,34 +2745,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; - struct global_cwq *gcwq; - struct cpu_workqueue_struct *cwq; + struct worker_pool *pool; + struct pool_workqueue *pwq; might_sleep(); - gcwq = get_work_gcwq(work); - if (!gcwq) + pool = get_work_pool(work); + if (!pool) return false; - spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued to a different gcwq under us, we - * are not going to wait. - */ - smp_rmb(); - cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) + spin_lock_irq(&pool->lock); + /* see the comment in try_to_grab_pending() with the same code */ + pwq = get_work_pwq(work); + if (pwq) { + if (unlikely(pwq->pool != pool)) goto already_gone; } else { - worker = find_worker_executing_work(gcwq, work); + worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; - cwq = worker->current_cwq; + pwq = worker->current_pwq; } - insert_wq_barrier(cwq, barr, work, worker); - spin_unlock_irq(&gcwq->lock); + insert_wq_barrier(pwq, barr, work, worker); + spin_unlock_irq(&pool->lock); /* * If @max_active is 1 or rescuer is in use, flushing another work @@ -2857,15 +2775,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) - lock_map_acquire(&cwq->wq->lockdep_map); + if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&pwq->wq->lockdep_map); else - lock_map_acquire_read(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + lock_map_acquire_read(&pwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); return true; already_gone: - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); return false; } @@ -2961,8 +2879,7 @@ bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(dwork->cpu, - get_work_cwq(&dwork->work)->wq, &dwork->work); + __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); } @@ -2992,7 +2909,8 @@ bool cancel_delayed_work(struct delayed_work *dwork) if (unlikely(ret < 0)) return false; - set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); + set_work_pool_and_clear_pending(&dwork->work, + get_work_pool_id(&dwork->work)); local_irq_restore(flags); return ret; } @@ -3171,46 +3089,46 @@ int keventd_up(void) return system_wq != NULL; } -static int alloc_cwqs(struct workqueue_struct *wq) +static int alloc_pwqs(struct workqueue_struct *wq) { /* - * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. * Make sure that the alignment isn't lower than that of * unsigned long long. */ - const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t size = sizeof(struct pool_workqueue); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); if (!(wq->flags & WQ_UNBOUND)) - wq->cpu_wq.pcpu = __alloc_percpu(size, align); + wq->pool_wq.pcpu = __alloc_percpu(size, align); else { void *ptr; /* - * Allocate enough room to align cwq and put an extra + * Allocate enough room to align pwq and put an extra * pointer at the end pointing back to the originally * allocated pointer which will be used for free. */ ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); if (ptr) { - wq->cpu_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->cpu_wq.single + 1) = ptr; + wq->pool_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->pool_wq.single + 1) = ptr; } } /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); - return wq->cpu_wq.v ? 0 : -ENOMEM; + BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); + return wq->pool_wq.v ? 0 : -ENOMEM; } -static void free_cwqs(struct workqueue_struct *wq) +static void free_pwqs(struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_wq.pcpu); - else if (wq->cpu_wq.single) { - /* the pointer to free is stored right after the cwq */ - kfree(*(void **)(wq->cpu_wq.single + 1)); + free_percpu(wq->pool_wq.pcpu); + else if (wq->pool_wq.single) { + /* the pointer to free is stored right after the pwq */ + kfree(*(void **)(wq->pool_wq.single + 1)); } } @@ -3264,27 +3182,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); - atomic_set(&wq->nr_cwqs_to_flush, 0); + atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - if (alloc_cwqs(wq) < 0) + if (alloc_pwqs(wq) < 0) goto err; - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = get_gcwq(cpu); - int pool_idx = (bool)(flags & WQ_HIGHPRI); - - BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->pool = &gcwq->pools[pool_idx]; - cwq->wq = wq; - cwq->flush_color = -1; - cwq->max_active = max_active; - INIT_LIST_HEAD(&cwq->delayed_works); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); + pwq->wq = wq; + pwq->flush_color = -1; + pwq->max_active = max_active; + INIT_LIST_HEAD(&pwq->delayed_works); } if (flags & WQ_RESCUER) { @@ -3297,7 +3213,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", + rescuer->rescue_wq = wq; + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) goto err; @@ -3314,8 +3231,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, spin_lock(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_cwq_cpu(cpu, wq) - get_cwq(cpu, wq)->max_active = 0; + for_each_pwq_cpu(cpu, wq) + get_pwq(cpu, wq)->max_active = 0; list_add(&wq->list, &workqueues); @@ -3324,7 +3241,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; err: if (wq) { - free_cwqs(wq); + free_pwqs(wq); free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); kfree(wq); @@ -3355,14 +3272,14 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); /* sanity check */ - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); int i; for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(cwq->nr_in_flight[i]); - BUG_ON(cwq->nr_active); - BUG_ON(!list_empty(&cwq->delayed_works)); + BUG_ON(pwq->nr_in_flight[i]); + BUG_ON(pwq->nr_active); + BUG_ON(!list_empty(&pwq->delayed_works)); } if (wq->flags & WQ_RESCUER) { @@ -3371,29 +3288,29 @@ void destroy_workqueue(struct workqueue_struct *wq) kfree(wq->rescuer); } - free_cwqs(wq); + free_pwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); /** - * cwq_set_max_active - adjust max_active of a cwq - * @cwq: target cpu_workqueue_struct + * pwq_set_max_active - adjust max_active of a pwq + * @pwq: target pool_workqueue * @max_active: new max_active value. * - * Set @cwq->max_active to @max_active and activate delayed works if + * Set @pwq->max_active to @max_active and activate delayed works if * increased. * * CONTEXT: - * spin_lock_irq(gcwq->lock). + * spin_lock_irq(pool->lock). */ -static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) { - cwq->max_active = max_active; + pwq->max_active = max_active; - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); } /** @@ -3416,16 +3333,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; - for_each_cwq_cpu(cpu, wq) { - struct global_cwq *gcwq = get_gcwq(cpu); + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct worker_pool *pool = pwq->pool; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); if (!(wq->flags & WQ_FREEZABLE) || - !(gcwq->flags & GCWQ_FREEZING)) - cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); + !(pool->flags & POOL_FREEZING)) + pwq_set_max_active(pwq, max_active); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } spin_unlock(&workqueue_lock); @@ -3446,57 +3364,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); */ bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct pool_workqueue *pwq = get_pwq(cpu, wq); - return !list_empty(&cwq->delayed_works); + return !list_empty(&pwq->delayed_works); } EXPORT_SYMBOL_GPL(workqueue_congested); /** - * work_cpu - return the last known associated cpu for @work - * @work: the work of interest - * - * RETURNS: - * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. - */ -unsigned int work_cpu(struct work_struct *work) -{ - struct global_cwq *gcwq = get_work_gcwq(work); - - return gcwq ? gcwq->cpu : WORK_CPU_NONE; -} -EXPORT_SYMBOL_GPL(work_cpu); - -/** * work_busy - test whether a work is currently pending or running * @work: the work to be tested * * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. - * Especially for reentrant wqs, the pending state might hide the - * running state. * * RETURNS: * OR'd bitmask of WORK_BUSY_* bits. */ unsigned int work_busy(struct work_struct *work) { - struct global_cwq *gcwq = get_work_gcwq(work); + struct worker_pool *pool = get_work_pool(work); unsigned long flags; unsigned int ret = 0; - if (!gcwq) - return 0; - - spin_lock_irqsave(&gcwq->lock, flags); - if (work_pending(work)) ret |= WORK_BUSY_PENDING; - if (find_worker_executing_work(gcwq, work)) - ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&gcwq->lock, flags); + if (pool) { + spin_lock_irqsave(&pool->lock, flags); + if (find_worker_executing_work(pool, work)) + ret |= WORK_BUSY_RUNNING; + spin_unlock_irqrestore(&pool->lock, flags); + } return ret; } @@ -3506,65 +3405,48 @@ EXPORT_SYMBOL_GPL(work_busy); * CPU hotplug. * * There are two challenges in supporting CPU hotplug. Firstly, there - * are a lot of assumptions on strong associations among work, cwq and - * gcwq which make migrating pending and scheduled works very + * are a lot of assumptions on strong associations among work, pwq and + * pool which make migrating pending and scheduled works very * difficult to implement without impacting hot paths. Secondly, - * gcwqs serve mix of short, long and very long running works making + * worker pools serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be disassociated from the CPU + * This is solved by allowing the pools to be disassociated from the CPU * running as an unbound one and allowing it to be reattached later if the * cpu comes back online. */ -/* claim manager positions of all pools */ -static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) +static void wq_unbind_fn(struct work_struct *work) { - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); - spin_lock_irq(&gcwq->lock); -} - -/* release manager positions */ -static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - spin_unlock_irq(&gcwq->lock); - for_each_worker_pool(pool, gcwq) - mutex_unlock(&pool->assoc_mutex); -} - -static void gcwq_unbind_fn(struct work_struct *work) -{ - struct global_cwq *gcwq = get_gcwq(smp_processor_id()); + int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - struct hlist_node *pos; int i; - BUG_ON(gcwq->cpu != smp_processor_id()); + for_each_std_worker_pool(pool, cpu) { + BUG_ON(cpu != smp_processor_id()); - gcwq_claim_assoc_and_lock(gcwq); + mutex_lock(&pool->assoc_mutex); + spin_lock_irq(&pool->lock); - /* - * We've claimed all manager positions. Make all workers unbound - * and set DISASSOCIATED. Before this, all workers except for the - * ones which are still executing works from before the last CPU - * down must be on the cpu. After this, they may become diasporas. - */ - for_each_worker_pool(pool, gcwq) + /* + * We've claimed all manager positions. Make all workers + * unbound and set DISASSOCIATED. Before this, all workers + * except for the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they may become diasporas. + */ list_for_each_entry(worker, &pool->idle_list, entry) worker->flags |= WORKER_UNBOUND; - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_UNBOUND; + for_each_busy_worker(worker, i, pool) + worker->flags |= WORKER_UNBOUND; - gcwq->flags |= GCWQ_DISASSOCIATED; + pool->flags |= POOL_DISASSOCIATED; - gcwq_release_assoc_and_unlock(gcwq); + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->assoc_mutex); + } /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3576,16 +3458,16 @@ static void gcwq_unbind_fn(struct work_struct *work) /* * Sched callbacks are disabled now. Zap nr_running. After this, * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. @gcwq now - * behaves as unbound (in terms of concurrency management) gcwq - * which is served by workers tied to the CPU. + * are always true as long as the worklist is not empty. Pools on + * @cpu now behave as unbound (in terms of concurrency management) + * pools which are served by workers tied to the CPU. * * On return from this function, the current worker would trigger * unbound chain execution of pending work items if other workers * didn't already. */ - for_each_worker_pool(pool, gcwq) - atomic_set(get_pool_nr_running(pool), 0); + for_each_std_worker_pool(pool, cpu) + atomic_set(&pool->nr_running, 0); } /* @@ -3597,12 +3479,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { struct worker *worker; if (pool->nr_workers) @@ -3612,18 +3493,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, if (!worker) return NOTIFY_BAD; - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); start_worker(worker); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } break; case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq_claim_assoc_and_lock(gcwq); - gcwq->flags &= ~GCWQ_DISASSOCIATED; - rebind_workers(gcwq); - gcwq_release_assoc_and_unlock(gcwq); + for_each_std_worker_pool(pool, cpu) { + mutex_lock(&pool->assoc_mutex); + spin_lock_irq(&pool->lock); + + pool->flags &= ~POOL_DISASSOCIATED; + rebind_workers(pool); + + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->assoc_mutex); + } break; } return NOTIFY_OK; @@ -3643,7 +3530,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* unbinding should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); queue_work_on(cpu, system_highpri_wq, &unbind_work); flush_work(&unbind_work); break; @@ -3696,10 +3583,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * * Start freezing workqueues. After this function returns, all freezable * workqueues will queue new works to their frozen_works list instead of - * gcwq->worklist. + * pool->worklist. * * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. + * Grabs and releases workqueue_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -3710,23 +3597,26 @@ void freeze_workqueues_begin(void) BUG_ON(workqueue_freezing); workqueue_freezing = true; - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); + for_each_wq_cpu(cpu) { + struct worker_pool *pool; struct workqueue_struct *wq; - spin_lock_irq(&gcwq->lock); + for_each_std_worker_pool(pool, cpu) { + spin_lock_irq(&pool->lock); - BUG_ON(gcwq->flags & GCWQ_FREEZING); - gcwq->flags |= GCWQ_FREEZING; + WARN_ON_ONCE(pool->flags & POOL_FREEZING); + pool->flags |= POOL_FREEZING; - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + list_for_each_entry(wq, &workqueues, list) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); - if (cwq && wq->flags & WQ_FREEZABLE) - cwq->max_active = 0; - } + if (pwq && pwq->pool == pool && + (wq->flags & WQ_FREEZABLE)) + pwq->max_active = 0; + } - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); + } } spin_unlock(&workqueue_lock); @@ -3754,20 +3644,20 @@ bool freeze_workqueues_busy(void) BUG_ON(!workqueue_freezing); - for_each_gcwq_cpu(cpu) { + for_each_wq_cpu(cpu) { struct workqueue_struct *wq; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct pool_workqueue *pwq = get_pwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZABLE)) + if (!pwq || !(wq->flags & WQ_FREEZABLE)) continue; - BUG_ON(cwq->nr_active < 0); - if (cwq->nr_active) { + BUG_ON(pwq->nr_active < 0); + if (pwq->nr_active) { busy = true; goto out_unlock; } @@ -3782,10 +3672,10 @@ out_unlock: * thaw_workqueues - thaw workqueues * * Thaw workqueues. Normal queueing is restored and all collected - * frozen works are transferred to their respective gcwq worklists. + * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases workqueue_lock and gcwq->lock's. + * Grabs and releases workqueue_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -3796,30 +3686,31 @@ void thaw_workqueues(void) if (!workqueue_freezing) goto out_unlock; - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); + for_each_wq_cpu(cpu) { struct worker_pool *pool; struct workqueue_struct *wq; - spin_lock_irq(&gcwq->lock); + for_each_std_worker_pool(pool, cpu) { + spin_lock_irq(&pool->lock); - BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); - gcwq->flags &= ~GCWQ_FREEZING; + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); + pool->flags &= ~POOL_FREEZING; - list_for_each_entry(wq, &workqueues, list) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + list_for_each_entry(wq, &workqueues, list) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZABLE)) - continue; + if (!pwq || pwq->pool != pool || + !(wq->flags & WQ_FREEZABLE)) + continue; - /* restore max_active and repopulate worklist */ - cwq_set_max_active(cwq, wq->saved_max_active); - } + /* restore max_active and repopulate worklist */ + pwq_set_max_active(pwq, wq->saved_max_active); + } - for_each_worker_pool(pool, gcwq) wake_up_worker(pool); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); + } } workqueue_freezing = false; @@ -3831,60 +3722,56 @@ out_unlock: static int __init init_workqueues(void) { unsigned int cpu; - int i; - /* make sure we have enough bits for OFFQ CPU number */ - BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < - WORK_CPU_LAST); + /* make sure we have enough bits for OFFQ pool ID */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < + WORK_CPU_END * NR_STD_WORKER_POOLS); cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); - /* initialize gcwqs */ - for_each_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); + /* initialize CPU pools */ + for_each_wq_cpu(cpu) { struct worker_pool *pool; - spin_lock_init(&gcwq->lock); - gcwq->cpu = cpu; - gcwq->flags |= GCWQ_DISASSOCIATED; - - for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) - INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - - for_each_worker_pool(pool, gcwq) { - pool->gcwq = gcwq; + for_each_std_worker_pool(pool, cpu) { + spin_lock_init(&pool->lock); + pool->cpu = cpu; + pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); init_timer_deferrable(&pool->idle_timer); pool->idle_timer.function = idle_worker_timeout; pool->idle_timer.data = (unsigned long)pool; - setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + + /* alloc pool ID */ + BUG_ON(worker_pool_assign_id(pool)); } } /* create the initial worker */ - for_each_online_gcwq_cpu(cpu) { - struct global_cwq *gcwq = get_gcwq(cpu); + for_each_online_wq_cpu(cpu) { struct worker_pool *pool; - if (cpu != WORK_CPU_UNBOUND) - gcwq->flags &= ~GCWQ_DISASSOCIATED; - - for_each_worker_pool(pool, gcwq) { + for_each_std_worker_pool(pool, cpu) { struct worker *worker; + if (cpu != WORK_CPU_UNBOUND) + pool->flags &= ~POOL_DISASSOCIATED; + worker = create_worker(pool); BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); + spin_lock_irq(&pool->lock); start_worker(worker); - spin_unlock_irq(&gcwq->lock); + spin_unlock_irq(&pool->lock); } } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h new file mode 100644 index 000000000000..07650264ec15 --- /dev/null +++ b/kernel/workqueue_internal.h @@ -0,0 +1,65 @@ +/* + * kernel/workqueue_internal.h + * + * Workqueue internal header file. Only to be included by workqueue and + * core kernel subsystems. + */ +#ifndef _KERNEL_WORKQUEUE_INTERNAL_H +#define _KERNEL_WORKQUEUE_INTERNAL_H + +#include <linux/workqueue.h> +#include <linux/kthread.h> + +struct worker_pool; + +/* + * The poor guys doing the actual heavy lifting. All on-duty workers are + * either serving the manager role, on idle list or on busy hash. For + * details on the locking annotation (L, I, X...), refer to workqueue.c. + * + * Only to be used in workqueue and async. + */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; + + struct work_struct *current_work; /* L: work being processed */ + work_func_t current_func; /* L: current_work's fn */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct worker_pool *pool; /* I: the associated pool */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + + /* for rebinding worker to CPU */ + struct work_struct rebind_work; /* L: for busy worker */ + + /* used only by rescuers to point to the target workqueue */ + struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ +}; + +/** + * current_wq_worker - return struct worker if %current is a workqueue worker + */ +static inline struct worker *current_wq_worker(void) +{ + if (current->flags & PF_WQ_WORKER) + return kthread_data(current); + return NULL; +} + +/* + * Scheduler hooks for concurrency managed workqueue. Only to be used from + * sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); + +#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h deleted file mode 100644 index 2d10fc98dc79..000000000000 --- a/kernel/workqueue_sched.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * kernel/workqueue_sched.h - * - * Scheduler hooks for concurrency managed workqueue. Only to be - * included from sched.c and workqueue.c. - */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); |